Initial SAB3R demo release
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set of changes.
- .gitattributes +11 -0
- .gitignore +148 -0
- LICENSE +7 -0
- README.md +185 -6
- app.py +18 -0
- assets/network_architecture.png +3 -0
- assets/qualitative_2.jpg +3 -0
- assets/teaser.jpg +3 -0
- assets/teaser_v5.jpg +3 -0
- config/deepspeed.json +38 -0
- config/training_config.yaml +55 -0
- config/training_config_full.yaml +57 -0
- demo.py +118 -0
- docker/docker-compose-cpu.yml +16 -0
- docker/docker-compose-cuda.yml +23 -0
- docker/files/cpu.Dockerfile +39 -0
- docker/files/cuda.Dockerfile +29 -0
- docker/files/entrypoint.sh +8 -0
- docker/run.sh +68 -0
- dust3r/.gitignore +132 -0
- dust3r/LICENSE +7 -0
- dust3r/NOTICE +12 -0
- dust3r/README.md +390 -0
- dust3r/assets/demo.jpg +3 -0
- dust3r/assets/dust3r.jpg +0 -0
- dust3r/assets/dust3r_archi.jpg +0 -0
- dust3r/assets/matching.jpg +3 -0
- dust3r/assets/pipeline1.jpg +0 -0
- dust3r/croco/LICENSE +52 -0
- dust3r/croco/NOTICE +21 -0
- dust3r/croco/README.MD +124 -0
- dust3r/croco/assets/Chateau1.png +3 -0
- dust3r/croco/assets/Chateau2.png +3 -0
- dust3r/croco/assets/arch.jpg +0 -0
- dust3r/croco/croco-stereo-flow-demo.ipynb +191 -0
- dust3r/croco/datasets_croco/__init__.py +0 -0
- dust3r/croco/datasets_croco/crops/README.MD +104 -0
- dust3r/croco/datasets_croco/crops/extract_crops_from_images.py +159 -0
- dust3r/croco/datasets_croco/habitat_sim/README.MD +76 -0
- dust3r/croco/datasets_croco/habitat_sim/__init__.py +0 -0
- dust3r/croco/datasets_croco/habitat_sim/generate_from_metadata.py +92 -0
- dust3r/croco/datasets_croco/habitat_sim/generate_from_metadata_files.py +27 -0
- dust3r/croco/datasets_croco/habitat_sim/generate_multiview_images.py +177 -0
- dust3r/croco/datasets_croco/habitat_sim/multiview_habitat_sim_generator.py +390 -0
- dust3r/croco/datasets_croco/habitat_sim/pack_metadata_files.py +69 -0
- dust3r/croco/datasets_croco/habitat_sim/paths.py +129 -0
- dust3r/croco/datasets_croco/pairs_dataset.py +109 -0
- dust3r/croco/datasets_croco/transforms.py +95 -0
- dust3r/croco/demo.py +55 -0
- dust3r/croco/interactive_demo.ipynb +271 -0
.gitattributes
CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/network_architecture.png filter=lfs diff=lfs merge=lfs -text
+assets/qualitative_2.jpg filter=lfs diff=lfs merge=lfs -text
+assets/teaser.jpg filter=lfs diff=lfs merge=lfs -text
+assets/teaser_v5.jpg filter=lfs diff=lfs merge=lfs -text
+dust3r/assets/demo.jpg filter=lfs diff=lfs merge=lfs -text
+dust3r/assets/matching.jpg filter=lfs diff=lfs merge=lfs -text
+dust3r/croco/assets/Chateau1.png filter=lfs diff=lfs merge=lfs -text
+dust3r/croco/assets/Chateau2.png filter=lfs diff=lfs merge=lfs -text
+dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/.ninja_deps filter=lfs diff=lfs merge=lfs -text
+dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/curope.o filter=lfs diff=lfs merge=lfs -text
+dust3r/croco/models/curope/build/temp.linux-x86_64-cpython-311/kernels.o filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,148 @@
*gl-outputs/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
launch_script/output/
preprocess_data/
images
launch_script/
paper/
.gradio/
semantic_extraction_ade20k*
semantic_extraction_voc*
semantic_seg_ade20k_clip*
zero_shot_semantic_seg_ade20k_inference*
# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# data
data/
# checkpoints
checkpoints/
# wandb
wandb/
# outputs
outputs/
LICENSE
ADDED
@@ -0,0 +1,7 @@
DUSt3R, Copyright (c) 2024-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license.

A summary of the CC BY-NC-SA 4.0 license is located here:
https://creativecommons.org/licenses/by-nc-sa/4.0/

The CC BY-NC-SA 4.0 license is located here:
https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
README.md
CHANGED
@@ -1,13 +1,192 @@
 ---
 title: SAB3R
-emoji:
-colorFrom:
-colorTo:
+emoji: 🌐
+colorFrom: blue
+colorTo: green
 sdk: gradio
-sdk_version:
-python_version: '3.12'
+sdk_version: 4.44.1
 app_file: app.py
 pinned: false
+license: cc-by-nc-sa-4.0
 ---
 
-
+# SAB3R: Semantic-Augmented Backbone in 3D Reconstruction
+
+<div align="center">
+
+**3D-LLM/VLA Workshop @ CVPR 2025**
+
+[**Xuweiyi Chen**](https://xuweiyichen.github.io/)<sup>*,1</sup> · [**Tian Xia**](https://tianx-ia.github.io/)<sup>*,2</sup> · [**Sihan Xu**](https://sihanxu.github.io/)<sup>2</sup> · [**Jed Jianing Yang**](https://jedyang.com/)<sup>2</sup> · [**Joyce Chai**](https://web.eecs.umich.edu/~chaijy/)<sup>2</sup> · [**Zezhou Cheng**](https://sites.google.com/site/zezhoucheng/)<sup>1</sup>
+
+<sup>1</sup>University of Virginia · <sup>2</sup>University of Michigan
+
+<sup>*</sup>Denotes Equal Contribution
+
+---
+
+[arXiv](https://www.arxiv.org/abs/2506.02112)
+[Project Page](https://uva-computer-vision-lab.github.io/sab3r/)
+[Hugging Face Demo](https://huggingface.co/spaces/uva-cv-lab/SAB3R)
+[](#)
+[Code](https://github.com/uva-computer-vision-lab/sab3r)
+
+</div>
+
+---
+
+![](assets/teaser_v5.jpg)
+
+*Given an unposed input video, we show ground truth for: open-vocabulary semantic segmentation (per-pixel labels for the prompt "a black office chair"), 3D reconstruction (ground-truth point cloud), and the proposed **Map and Locate** task (open-vocabulary segmentation and point cloud). The Map and Locate task: (1) encompasses both 2D and 3D tasks, (2) bridges reconstruction and recognition, and (3) introduces practical questions in robotics and embodied AI.*
+
+## Release Plan
+
+- [x] Demo Release
+- [x] Training and Inference Code Release
+- [ ] Release Map and Locate Dataset
+
+## Abstract
+
+We introduce **Map and Locate**, a task that unifies open-vocabulary segmentation and 3D reconstruction from unposed videos. Our method, **SAB3R**, builds upon MASt3R and incorporates lightweight distillation from CLIP and DINOv2 to generate semantic point clouds in a single forward pass. SAB3R achieves superior performance compared to separate deployment of MASt3R and CLIP on the Map and Locate benchmark.
+
+## Network Architecture
+
+![](assets/network_architecture.png)
+
+**SAB3R** distills dense features from CLIP and DINO into the MASt3R framework, enriching it with 2D semantic understanding. Each encoder-decoder pair operates on multi-view images, sharing weights and exchanging information to ensure consistent feature extraction across views. The model simultaneously generates depth, dense DINOv2, and dense CLIP features, which are then used for multi-view 3D reconstruction and semantic segmentation. This architecture enables SAB3R to seamlessly integrate 2D and 3D representations, achieving both geometric and semantic comprehension in a unified model.
+
+## Installation
+
+1. Clone the repository:
+```bash
+git clone --recursive https://github.com/uva-computer-vision-lab/sab3r
+cd sab3r
+# if you have already cloned:
+# git submodule update --init --recursive
+```
+
+2. Create the environment:
+```bash
+conda create -n sab3r python=3.11 cmake=3.14.0
+conda activate sab3r
+conda install pytorch torchvision pytorch-cuda=12.1 -c pytorch -c nvidia
+pip install -r requirements.txt
+
+# FeatUp (not on PyPI) — required for the CLIP/DINO semantic heads.
+pip install git+https://github.com/mhamilton723/FeatUp
+```
+
+3. (Optional) Compile RoPE CUDA kernels for faster inference:
+```bash
+cd dust3r/croco/models/curope/
+python setup.py build_ext --inplace
+cd ../../../../
+```
+
+4. (Optional) Pre-download the CLIP BPE vocab (the demo will fetch it on first run):
+```bash
+mkdir -p ~/.cache/clip
+cd ~/.cache/clip
+wget https://github.com/openai/CLIP/raw/main/clip/bpe_simple_vocab_16e6.txt.gz
+```
+
+## Demo
+
+The demo launches a Gradio UI for 3D reconstruction + open-vocabulary text queries.
+
+**Checkpoint from HF Hub (default)**
+```bash
+python demo.py \
+    --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric \
+    --local_network --share
+```
+This downloads `demo_ckpt/base/base.pt` from [`uva-cv-lab/SAB3R`](https://huggingface.co/uva-cv-lab/SAB3R) on first launch and caches it in `~/.cache/huggingface/`.
+
+**Local checkpoint**
+```bash
+python demo.py \
+    --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric \
+    --weights /path/to/your.pt \
+    --local_network --share
+```
+
+**Override the HF Hub repo / filename**
+```bash
+python demo.py \
+    --model_name MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric \
+    --model_repo your-org/your-sab3r-ckpt \
+    --ckpt_filename model.pt
+```
+
+**Local dev with a checkpoint dropdown** — if you keep multiple checkpoints under a directory (one sub-directory per checkpoint, each holding `<name>.pt`), pass `--checkpoint_dir`:
+```bash
+python demo.py --checkpoint_dir /path/to/ckpt_root --local_network --share
+```
+
+## Hugging Face Spaces
+
+This repo ships with an `app.py` entry point for Hugging Face Spaces. To deploy:
+
+1. Create a Gradio Space at https://huggingface.co/new-space (SDK: `gradio`).
+2. Upload the SAB3R checkpoint to a model repo (default expected: `uva-cv-lab/SAB3R`, filename `base.pt`). To use a different repo, set the `SAB3R_MODEL_REPO` / `SAB3R_CKPT_FILENAME` env vars on the Space, or edit `demo.py`'s defaults.
+3. Push this repo (minus the `env/` conda env and the heavyweight submodule binaries) to the Space. The Space's `README.md` must begin with the YAML frontmatter below; add it to your Space copy of this README before pushing:
+
+```yaml
+---
+title: SAB3R
+emoji: 🌐
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 4.44.1
+app_file: app.py
+pinned: false
+license: cc-by-nc-sa-4.0
+---
+```
+
+FeatUp needs to be installed from GitHub, which Spaces handles via `requirements.txt` — the one in this repo already leaves a comment pointing at the install line; uncomment it for the Space:
+
+```txt
+git+https://github.com/mhamilton723/FeatUp
+```
+
+## Training
+
+Two canonical configs are provided under `config/`:
+
+- `training_config.yaml` — minimal dev recipe (CLIP distillation on a Co3D subset).
+- `training_config_full.yaml` — full paper recipe (CLIP + DINO distillation on Habitat + ScanNet++ + ARKitScenes + Co3D).
+
+Both configs reference paths relative to the repo root (e.g. `./data`, `./checkpoints`, `./outputs`); override them via Hydra:
+
+```bash
+torchrun --nproc_per_node=8 train.py \
+    --config-name training_config_full \
+    dataset_url=/path/to/data \
+    output_url=/path/to/outputs
+```
+
+Set `WANDB_API_KEY` in your shell (do **not** commit it) if you want experiment tracking.
+
+## Citation
+
+```bibtex
+@article{chen2025sab3rsemanticaugmentedbackbone3d,
+      title={SAB3R: Semantic-Augmented Backbone in 3D Reconstruction},
+      author={Xuweiyi Chen and Tian Xia and Sihan Xu and Jianing Yang and Joyce Chai and Zezhou Cheng},
+      year={2025},
+      eprint={2506.02112},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2506.02112},
+}
+```
+
+## Acknowledgments
+
+This work builds upon [MASt3R](https://github.com/naver/mast3r), [DUSt3R](https://github.com/naver/dust3r) and [FeatUp](https://github.com/mhamilton723/FeatUp). We thank the original authors for their excellent work and open-source contributions.
+
+## License
+
+The code is distributed under the CC BY-NC-SA 4.0 License.
+See [LICENSE](LICENSE) for more information.
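For reference, the default checkpoint above can also be fetched by hand. A minimal sketch, assuming `huggingface_hub` is installed; the repo id and filename are the same defaults `demo.py` uses:

```python
# Minimal sketch: fetch the default SAB3R demo checkpoint manually.
# The download is cached under ~/.cache/huggingface/, matching the
# first-launch behavior described in the README's Demo section.
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(repo_id="uva-cv-lab/SAB3R",
                            filename="demo_ckpt/base/base.pt")
print(ckpt_path)
```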
app.py
ADDED
@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# --------------------------------------------------------
# Hugging Face Spaces entry point for the SAB3R demo.
#
# Spaces looks for `app.py` by default. This wrapper just sets
# Spaces-appropriate defaults and delegates to demo.py. For local
# dev, run demo.py directly (it exposes --share, --local_network, etc).
# --------------------------------------------------------
import os

from demo import main

if __name__ == "__main__":
    main([
        "--model_name", "MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric",
        "--server_name", "0.0.0.0",
        "--server_port", os.environ.get("GRADIO_SERVER_PORT", "7860"),
    ])
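A Space that should serve a different checkpoint can set the env vars `demo.py` reads before delegating. A hedged variant of the wrapper above; the repo id and filename are placeholders, mirroring the README's override example:

```python
# Sketch of a Spaces entry point pointing at a different checkpoint repo.
# SAB3R_MODEL_REPO / SAB3R_CKPT_FILENAME are the env vars demo.py consults;
# the values below are hypothetical placeholders.
import os

os.environ.setdefault("SAB3R_MODEL_REPO", "your-org/your-sab3r-ckpt")
os.environ.setdefault("SAB3R_CKPT_FILENAME", "model.pt")

from demo import main

if __name__ == "__main__":
    main([
        "--model_name", "MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric",
        "--server_name", "0.0.0.0",
        "--server_port", os.environ.get("GRADIO_SERVER_PORT", "7860"),
    ])
```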
assets/network_architecture.png
ADDED
(binary image, stored with Git LFS)

assets/qualitative_2.jpg
ADDED
(binary image, stored with Git LFS)

assets/teaser.jpg
ADDED
(binary image, stored with Git LFS)

assets/teaser_v5.jpg
ADDED
(binary image, stored with Git LFS)
config/deepspeed.json
ADDED
@@ -0,0 +1,38 @@
{
    "train_micro_batch_size_per_gpu": 1,
    "gradient_accumulation_steps": 8,
    "steps_per_print": 1,
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 1e-4,
            "betas": [0.9, 0.95],
            "weight_decay": 0.05
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0.0,
            "warmup_max_lr": 1e-4,
            "warmup_num_steps": 4000
        }
    },
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "reduce_bucket_size": 5e7
    },
    "activation_checkpointing": {
        "partition_activations": true,
        "contiguous_memory_optimization": true,
        "cpu_checkpointing": true,
        "number_checkpoints": 2
    }
}
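A config like this is handed to `deepspeed.initialize`, which builds the AdamW optimizer, WarmupLR scheduler, and ZeRO stage-2 state declared above. A minimal sketch of the generic DeepSpeed entry point, not necessarily how this repo's `train.py` wires it up:

```python
# Minimal sketch: hand config/deepspeed.json to DeepSpeed. In practice this
# runs under a distributed launcher (e.g. `deepspeed script.py`).
import deepspeed
import torch

model = torch.nn.Linear(16, 16)  # stand-in for the real SAB3R model
engine, optimizer, _, scheduler = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="config/deepspeed.json",
)
```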
config/training_config.yaml
ADDED
@@ -0,0 +1,55 @@
seed: 8

# Override these paths via hydra (e.g. `python train.py dataset_url=/path/to/data`)
# or by editing this file.
dataset_url: "./data"
code_url: "."
output_url: "./outputs"

model:
  name: "AsymmetricMASt3R(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='catmlp+dpt', output_mode='pts3d+desc24', clip_head_type='dpt', dino_head_type=None, depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12, two_confs=True)"
  pretrained: "checkpoints/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth"


clip_checkpoint_weights_path: "checkpoints/maskclip_jbu_stack_cocostuff.ckpt"
dino_checkpoint_weights_path: "checkpoints/dinov2_jbu_stack_cocostuff.ckpt"

train_criterion: "ConfFeatLoss(Regr3D(L21, norm_mode='?avg_dis'), alpha=0.2, beta=0.4, gamma=0.4, need_clip=True, need_dino=False, clip_checkpoint_weights_path='${clip_checkpoint_weights_path}', dino_checkpoint_weights_path='${dino_checkpoint_weights_path}') + 0.075*ConfMatchingLoss(MatchingLoss(InfoNCE(mode='proper', temperature=0.05), negatives_padding=0, blocksize=8192), alpha=10.0, confmode='mean')"
test_criterion: "FeatRegr3D_ScaleShiftInv(L21, norm_mode='?avg_dis', need_clip=True, need_dino=False, gt_scale=True, sky_loss_value=0, clip_checkpoint_weights_path='${clip_checkpoint_weights_path}', dino_checkpoint_weights_path='${dino_checkpoint_weights_path}') + -1.*MatchingLoss(APLoss(nq='torch', fp=torch.float16), negatives_padding=12288)"


dataset:
  train: "1000 @ Co3d(split='train', ROOT='${dataset_url}/co3d_subset_processed', aug_crop='auto', aug_monocular=0.005, aug_rot90='diff', mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], n_corres=8192, nneg=0.5, transform=ColorJitter)"
  test: "100 @ Co3d(split='test', ROOT='${dataset_url}/co3d_subset_processed', resolution=(512,384), n_corres=1024, seed=777)"

training:
  epochs: 200
  disable_cudnn_benchmark: true
  print_freq: 1

saving:
  save_freq: 10
  keep_freq: 40
  eval_freq: 10

output_dir: "${output_url}/sab3r"

wandb:
  project_name: "sab3r"
  entity: "uva-computer-vision-lab"
  group: "sab3r"

num_workers: 8
dist_url: "env://"
distributed: True

world_size: -1
gpu: -1
rank: -1
dist_backend: "nccl"

resume: ""
start_epoch: 0
disable_cudnn_benchmark: True
amp: 1
deepspeed_config: "${code_url}/config/deepspeed.json"
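The `${...}` values in this file are OmegaConf interpolations against the top-level keys, which is what makes the Hydra overrides in the README work. A quick sketch of inspecting the resolved config, assuming `omegaconf` is installed (Hydra requires it anyway):

```python
# Sketch: load the config and resolve ${dataset_url}-style interpolations.
from omegaconf import OmegaConf

cfg = OmegaConf.load("config/training_config.yaml")
cfg.dataset_url = "/path/to/data"   # same effect as the Hydra CLI override
OmegaConf.resolve(cfg)
print(cfg.dataset.test)             # ROOT now points at /path/to/data/...
print(cfg.deepspeed_config)         # ./config/deepspeed.json
```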
config/training_config_full.yaml
ADDED
@@ -0,0 +1,57 @@
seed: 8

# Full SAB3R training recipe (CLIP + DINO distillation on Habitat + ScanNet++ + ARKitScenes + Co3D).
# Override these paths via hydra (e.g. `python train.py --config-name training_config_full dataset_url=/path/to/data`)
# or by editing this file.
dataset_url: "./data"
code_url: "."
output_url: "./outputs"

model:
  name: "AsymmetricMASt3R(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='catmlp+dpt', output_mode='pts3d+desc24', clip_head_type='dpt', dino_head_type='dpt', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12, two_confs=True)"
  pretrained: "checkpoints/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth"


clip_checkpoint_weights_path: "checkpoints/maskclip_jbu_stack_cocostuff.ckpt"
dino_checkpoint_weights_path: "checkpoints/dinov2_jbu_stack_cocostuff.ckpt"

train_criterion: "ConfFeatLoss(Regr3D(L21, norm_mode='?avg_dis'), alpha=1, beta=20, gamma=4, need_clip=True, need_dino=True, need_mask=False, clip_checkpoint_weights_path='${clip_checkpoint_weights_path}', dino_checkpoint_weights_path='${dino_checkpoint_weights_path}') + 0.75*ConfMatchingLoss(MatchingLoss(InfoNCE(mode='proper', temperature=0.05), negatives_padding=0, blocksize=8192), alpha=10.0, confmode='mean')"
test_criterion: "FeatRegr3D_ScaleShiftInv(L21, norm_mode='?avg_dis', need_clip=True, need_dino=True, gt_scale=True, sky_loss_value=0, clip_checkpoint_weights_path='${clip_checkpoint_weights_path}', dino_checkpoint_weights_path='${dino_checkpoint_weights_path}') + -1.*MatchingLoss(APLoss(nq='torch', fp=torch.float16), negatives_padding=12288)"

dataset:
  train: "57_000 @ Habitat512(1_000_000, split='train', ROOT='${dataset_url}/habitat_processed', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 45_600 @ ScanNetpp(split='train', ROOT='${dataset_url}/scannetpp_processed', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 45_600 @ ARKitScenes(split='train', ROOT='${dataset_url}/arkitscenes_processed', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], aug_crop='auto', aug_monocular=0.005, transform=ColorJitter, n_corres=8192, nneg=0.5) + 22_800 @ Co3d(split='train', ROOT='${dataset_url}/co3d_50_seqs_per_category_subset_processed', aug_crop='auto', aug_monocular=0.005, aug_rot90='diff', mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], n_corres=8192, nneg=0.5, transform=ColorJitter)"
  test: "100 @ Co3d(split='test', ROOT='${dataset_url}/co3d_50_seqs_per_category_subset_processed', resolution=(512,384), n_corres=1024, seed=777)"

training:
  epochs: 5
  disable_cudnn_benchmark: true
  print_freq: 1

saving:
  save_freq: 1
  keep_freq: 1
  eval_freq: 1
  save_steps: 5000
  plot_steps: 200

output_dir: "${output_url}/sab3r_full"

wandb:
  project_name: "sab3r"
  entity: "uva-computer-vision-lab"
  group: "sab3r_full"

num_workers: 8
dist_url: "env://"
distributed: True

world_size: -1
gpu: -1
rank: -1
dist_backend: "nccl"

resume: ""
start_epoch: 0
disable_cudnn_benchmark: True
amp: 0
deepspeed_config: "${code_url}/config/deepspeed.json"
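The `N @ Dataset(...)` terms in `dataset.train` fix how many pairs each source contributes per epoch; the implied budget, for orientation:

```python
# Per-epoch pair budget implied by the dataset.train mixture above.
habitat, scannetpp, arkitscenes, co3d = 57_000, 45_600, 45_600, 22_800
per_epoch = habitat + scannetpp + arkitscenes + co3d
print(per_epoch)      # 171000 pairs per epoch
print(per_epoch * 5)  # 855000 pairs across the 5-epoch recipe
```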
demo.py
ADDED
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# SAB3R gradio demo executable
# --------------------------------------------------------
import os
import argparse
import tempfile
from contextlib import nullcontext

import torch
from huggingface_hub import hf_hub_download

from mast3r.demo import get_args_parser as sab3r_get_args_parser, main_demo
from mast3r.model import AsymmetricMASt3R  # noqa: F401 (referenced via eval() below)
from mast3r.utils.misc import hash_md5

import mast3r.utils.path_to_dust3r  # noqa: F401 (side-effect: puts vendored dust3r on sys.path)
from dust3r.demo import set_print_with_timestamp

import matplotlib.pyplot as pl
pl.ion()

torch.backends.cuda.matmul.allow_tf32 = True  # for gpu >= Ampere and pytorch >= 1.12

inf = float("inf")

DEFAULT_MODEL_REPO = "uva-cv-lab/SAB3R"
DEFAULT_CKPT_FILENAME = "demo_ckpt/base/base.pt"


def get_args_parser():
    parser = sab3r_get_args_parser()
    parser.add_argument(
        "--model_repo",
        default=os.environ.get("SAB3R_MODEL_REPO", DEFAULT_MODEL_REPO),
        help="Hugging Face Hub repo id hosting the SAB3R checkpoint "
             "(used only when --weights is not provided).",
    )
    parser.add_argument(
        "--ckpt_filename",
        default=os.environ.get("SAB3R_CKPT_FILENAME", DEFAULT_CKPT_FILENAME),
        help="Checkpoint filename inside --model_repo.",
    )
    parser.add_argument(
        "--checkpoint_dir",
        default=os.environ.get("SAB3R_CHECKPOINT_DIR", None),
        help="Optional local directory containing one sub-directory per "
             "checkpoint (each sub-dir must hold `<name>.pt`). When provided, "
             "the UI exposes a dropdown to switch between them. Useful for "
             "local dev; leave unset for single-checkpoint HF Spaces deployments.",
    )
    return parser


def load_weights(model, ckp_path, device):
    ckp = torch.load(ckp_path, map_location='cpu')
    if ckp_path.endswith('.pth'):
        model.load_state_dict(ckp['model'], strict=False)
    elif ckp_path.endswith('.pt'):
        model.load_state_dict(ckp['module'])
    else:
        raise ValueError(f"Unknown checkpoint format: {ckp_path}")
    model.to(device)


def build_model_config():
    return (
        "AsymmetricMASt3R(pos_embed='RoPE100', patch_embed_cls='PatchEmbedDust3R', "
        "img_size=(512, 512), head_type='catmlp+dpt', output_mode='pts3d+desc24', "
        "clip_head_type='dpt', dino_head_type='dpt', "
        "depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), "
        "enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, "
        "dec_embed_dim=768, dec_depth=12, dec_num_heads=12, "
        "two_confs=True, landscape_only=False)"
    )


def resolve_weights_path(args: argparse.Namespace) -> str:
    if args.weights:
        return args.weights
    print(f"[sab3r] Downloading checkpoint from HF Hub: {args.model_repo}/{args.ckpt_filename}")
    return hf_hub_download(repo_id=args.model_repo, filename=args.ckpt_filename)


def main(argv=None):
    parser = get_args_parser()
    args = parser.parse_args(argv)
    set_print_with_timestamp()

    if args.server_name is not None:
        server_name = args.server_name
    else:
        server_name = '0.0.0.0' if args.local_network else '127.0.0.1'

    model = eval(build_model_config())
    ckp_path = resolve_weights_path(args)
    load_weights(model, ckp_path, args.device)
    chkpt_tag = hash_md5(ckp_path)

    def get_context(tmp_dir):
        return (tempfile.TemporaryDirectory(suffix='_sab3r_gradio_demo') if tmp_dir is None
                else nullcontext(tmp_dir))

    with get_context(args.tmp_dir) as tmpdirname:
        cache_path = os.path.join(tmpdirname, chkpt_tag)
        os.makedirs(cache_path, exist_ok=True)
        main_demo(
            cache_path, model, args.device, args.image_size, server_name, args.server_port,
            silent=args.silent, share=args.share, gradio_delete_cache=args.gradio_delete_cache,
            checkpoint_dir=args.checkpoint_dir,
        )


if __name__ == '__main__':
    main()
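The same helpers can instantiate the model outside the Gradio UI. A hedged sketch, assuming the conda env from the README; the checkpoint path is a placeholder:

```python
# Sketch: build the demo model and load a local checkpoint without the UI.
# AsymmetricMASt3R and `inf` must be in scope because build_model_config()
# returns a string that is eval()'d, exactly as demo.py does internally.
from mast3r.model import AsymmetricMASt3R  # noqa: F401 (used via eval)
from demo import build_model_config, load_weights

inf = float("inf")  # referenced by depth_mode/conf_mode in the config string
model = eval(build_model_config())
load_weights(model, "/path/to/your.pt", "cuda")  # placeholder checkpoint path
model.eval()
```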
docker/docker-compose-cpu.yml
ADDED
@@ -0,0 +1,16 @@
version: '3.8'
services:
  mast3r-demo:
    build:
      context: ./files
      dockerfile: cpu.Dockerfile
    ports:
      - "7860:7860"
    volumes:
      - ./files/checkpoints:/mast3r/checkpoints
    environment:
      - DEVICE=cpu
      - MODEL=${MODEL:-MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth}
    cap_add:
      - IPC_LOCK
      - SYS_RESOURCE
docker/docker-compose-cuda.yml
ADDED
@@ -0,0 +1,23 @@
version: '3.8'
services:
  mast3r-demo:
    build:
      context: ./files
      dockerfile: cuda.Dockerfile
    ports:
      - "7860:7860"
    environment:
      - DEVICE=cuda
      - MODEL=${MODEL:-MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth}
    volumes:
      - ./files/checkpoints:/mast3r/checkpoints
    cap_add:
      - IPC_LOCK
      - SYS_RESOURCE
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
docker/files/cpu.Dockerfile
ADDED
@@ -0,0 +1,39 @@
FROM python:3.11-slim

LABEL description="Docker container for MASt3R with dependencies installed. CPU VERSION"

ENV DEVICE="cpu"
ENV MODEL="MASt3R_ViTLarge_BaseDecoder_512_dpt.pth"
ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y \
    git \
    libgl1-mesa-glx \
    libegl1-mesa \
    libxrandr2 \
    libxrandr2 \
    libxss1 \
    libxcursor1 \
    libxcomposite1 \
    libasound2 \
    libxi6 \
    libxtst6 \
    libglib2.0-0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

RUN git clone --recursive https://github.com/naver/mast3r /mast3r
WORKDIR /mast3r/dust3r

RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
RUN pip install -r requirements.txt
RUN pip install -r requirements_optional.txt
RUN pip install opencv-python==4.8.0.74

WORKDIR /mast3r
RUN pip install -r requirements.txt

COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

ENTRYPOINT ["/entrypoint.sh"]
docker/files/cuda.Dockerfile
ADDED
@@ -0,0 +1,29 @@
FROM nvcr.io/nvidia/pytorch:24.01-py3

LABEL description="Docker container for MASt3R with dependencies installed. CUDA VERSION"
ENV DEVICE="cuda"
ENV MODEL="MASt3R_ViTLarge_BaseDecoder_512_dpt.pth"
ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y \
    git=1:2.34.1-1ubuntu1.10 \
    libglib2.0-0=2.72.4-0ubuntu2.2 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

RUN git clone --recursive https://github.com/naver/mast3r /mast3r
WORKDIR /mast3r/dust3r
RUN pip install -r requirements.txt
RUN pip install -r requirements_optional.txt
RUN pip install opencv-python==4.8.0.74

WORKDIR /mast3r/dust3r/croco/models/curope/
RUN python setup.py build_ext --inplace

WORKDIR /mast3r
RUN pip install -r requirements.txt

COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

ENTRYPOINT ["/entrypoint.sh"]
docker/files/entrypoint.sh
ADDED
@@ -0,0 +1,8 @@
#!/bin/bash

set -eux

DEVICE=${DEVICE:-cuda}
MODEL=${MODEL:-MASt3R_ViTLarge_BaseDecoder_512_dpt.pth}

exec python3 demo.py --weights "checkpoints/$MODEL" --device "$DEVICE" --local_network "$@"
docker/run.sh
ADDED
@@ -0,0 +1,68 @@
#!/bin/bash

set -eux

# Default model name
model_name="MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth"

check_docker() {
    if ! command -v docker &>/dev/null; then
        echo "Docker could not be found. Please install Docker and try again."
        exit 1
    fi
}

download_model_checkpoint() {
    if [ -f "./files/checkpoints/${model_name}" ]; then
        echo "Model checkpoint ${model_name} already exists. Skipping download."
        return
    fi
    echo "Downloading model checkpoint ${model_name}..."
    wget "https://download.europe.naverlabs.com/ComputerVision/MASt3R/${model_name}" -P ./files/checkpoints
}

set_dcomp() {
    if command -v docker-compose &>/dev/null; then
        dcomp="docker-compose"
    elif command -v docker &>/dev/null && docker compose version &>/dev/null; then
        dcomp="docker compose"
    else
        echo "Docker Compose could not be found. Please install Docker Compose and try again."
        exit 1
    fi
}

run_docker() {
    export MODEL=${model_name}
    if [ "$with_cuda" -eq 1 ]; then
        $dcomp -f docker-compose-cuda.yml up --build
    else
        $dcomp -f docker-compose-cpu.yml up --build
    fi
}

with_cuda=0
for arg in "$@"; do
    case $arg in
        --with-cuda)
            with_cuda=1
            ;;
        --model_name=*)
            model_name="${arg#*=}.pth"
            ;;
        *)
            echo "Unknown parameter passed: $arg"
            exit 1
            ;;
    esac
done


main() {
    check_docker
    download_model_checkpoint
    set_dcomp
    run_docker
}

main
dust3r/.gitignore
ADDED
@@ -0,0 +1,132 @@
data/
checkpoints/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
dust3r/LICENSE
ADDED
@@ -0,0 +1,7 @@
DUSt3R, Copyright (c) 2024-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license.

A summary of the CC BY-NC-SA 4.0 license is located here:
https://creativecommons.org/licenses/by-nc-sa/4.0/

The CC BY-NC-SA 4.0 license is located here:
https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
dust3r/NOTICE
ADDED
@@ -0,0 +1,12 @@
DUSt3R
Copyright 2024-present NAVER Corp.

This project contains subcomponents with separate copyright notices and license terms.
Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses.

====

naver/croco
https://github.com/naver/croco/

Creative Commons Attribution-NonCommercial-ShareAlike 4.0
dust3r/README.md
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+

|
| 2 |
+
|
| 3 |
+
Official implementation of `DUSt3R: Geometric 3D Vision Made Easy`
|
| 4 |
+
[[Project page](https://dust3r.europe.naverlabs.com/)], [[DUSt3R arxiv](https://arxiv.org/abs/2312.14132)]
|
| 5 |
+
|
| 6 |
+
> **Make sure to also check [MASt3R](https://github.com/naver/mast3r): Our new model with a local feature head, metric pointmaps, and a more scalable global alignment!**
|
| 7 |
+
|
| 8 |
+

|
| 9 |
+
|
| 10 |
+

|
| 11 |
+
|
| 12 |
+
```bibtex
|
| 13 |
+
@inproceedings{dust3r_cvpr24,
|
| 14 |
+
title={DUSt3R: Geometric 3D Vision Made Easy},
|
| 15 |
+
author={Shuzhe Wang and Vincent Leroy and Yohann Cabon and Boris Chidlovskii and Jerome Revaud},
|
| 16 |
+
booktitle = {CVPR},
|
| 17 |
+
year = {2024}
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
@misc{dust3r_arxiv23,
|
| 21 |
+
title={DUSt3R: Geometric 3D Vision Made Easy},
|
| 22 |
+
author={Shuzhe Wang and Vincent Leroy and Yohann Cabon and Boris Chidlovskii and Jerome Revaud},
|
| 23 |
+
year={2023},
|
| 24 |
+
eprint={2312.14132},
|
| 25 |
+
archivePrefix={arXiv},
|
| 26 |
+
primaryClass={cs.CV}
|
| 27 |
+
}
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
## Table of Contents
|
| 31 |
+
|
| 32 |
+
- [Table of Contents](#table-of-contents)
|
| 33 |
+
- [License](#license)
|
| 34 |
+
- [Get Started](#get-started)
|
| 35 |
+
- [Installation](#installation)
|
| 36 |
+
- [Checkpoints](#checkpoints)
|
| 37 |
+
- [Interactive demo](#interactive-demo)
|
| 38 |
+
- [Interactive demo with docker](#interactive-demo-with-docker)
|
| 39 |
+
- [Usage](#usage)
|
| 40 |
+
- [Training](#training)
|
| 41 |
+
- [Datasets](#datasets)
|
| 42 |
+
- [Demo](#demo)
|
| 43 |
+
- [Our Hyperparameters](#our-hyperparameters)
|
| 44 |
+
|
| 45 |
+
## License
|
| 46 |
+
|
| 47 |
+
The code is distributed under the CC BY-NC-SA 4.0 License.
|
| 48 |
+
See [LICENSE](LICENSE) for more information.
|
| 49 |
+
|
| 50 |
+
```python
|
| 51 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
| 52 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
## Get Started
|
| 56 |
+
|
| 57 |
+
### Installation
|
| 58 |
+
|
| 59 |
+
1. Clone DUSt3R.
|
| 60 |
+
```bash
|
| 61 |
+
git clone --recursive https://github.com/naver/dust3r
|
| 62 |
+
cd dust3r
|
| 63 |
+
# if you have already cloned dust3r:
|
| 64 |
+
# git submodule update --init --recursive
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
2. Create the environment, here we show an example using conda.
|
| 68 |
+
```bash
|
| 69 |
+
conda create -n dust3r python=3.11 cmake=3.14.0
|
| 70 |
+
conda activate dust3r
|
| 71 |
+
conda install pytorch torchvision pytorch-cuda=12.1 -c pytorch -c nvidia # use the correct version of cuda for your system
|
| 72 |
+
pip install -r requirements.txt
|
| 73 |
+
# Optional: you can also install additional packages to:
|
| 74 |
+
# - add support for HEIC images
|
| 75 |
+
# - add pyrender, used to render depthmap in some datasets preprocessing
|
| 76 |
+
# - add required packages for visloc.py
|
| 77 |
+
pip install -r requirements_optional.txt
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
3. Optional, compile the cuda kernels for RoPE (as in CroCo v2).
|
| 81 |
+
```bash
|
| 82 |
+
# DUST3R relies on RoPE positional embeddings for which you can compile some cuda kernels for faster runtime.
|
| 83 |
+
cd croco/models/curope/
|
| 84 |
+
python setup.py build_ext --inplace
|
| 85 |
+
cd ../../../
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
### Checkpoints
|
| 89 |
+
|
| 90 |
+
You can obtain the checkpoints by two ways:
|
| 91 |
+
|
| 92 |
+
1) You can use our huggingface_hub integration: the models will be downloaded automatically.
|
| 93 |
+
|
| 94 |
+
2) Otherwise, We provide several pre-trained models:
|
| 95 |
+
|
| 96 |
+
| Modelname | Training resolutions | Head | Encoder | Decoder |
|
| 97 |
+
|-------------|----------------------|------|---------|---------|
|
| 98 |
+
| [`DUSt3R_ViTLarge_BaseDecoder_224_linear.pth`](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_224_linear.pth) | 224x224 | Linear | ViT-L | ViT-B |
|
| 99 |
+
| [`DUSt3R_ViTLarge_BaseDecoder_512_linear.pth`](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_linear.pth) | 512x384, 512x336, 512x288, 512x256, 512x160 | Linear | ViT-L | ViT-B |
|
| 100 |
+
| [`DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth`](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth) | 512x384, 512x336, 512x288, 512x256, 512x160 | DPT | ViT-L | ViT-B |
|
| 101 |
+
|
| 102 |
+
You can check the hyperparameters we used to train these models in the [section: Our Hyperparameters](#our-hyperparameters)
|
| 103 |
+
|
| 104 |
+
To download a specific model, for example `DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth`:
|
| 105 |
+
```bash
|
| 106 |
+
mkdir -p checkpoints/
|
| 107 |
+
wget https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth -P checkpoints/
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
For the checkpoints, make sure to agree to the license of all the public training datasets and base checkpoints we used, in addition to CC-BY-NC-SA 4.0. Again, see [section: Our Hyperparameters](#our-hyperparameters) for details.
|
| 111 |
+
|
| 112 |
+
### Interactive demo

In this demo, you should be able to run DUSt3R on your machine to reconstruct a scene.
First select images that depict the same scene.

You can adjust the global alignment schedule and its number of iterations.

> [!NOTE]
> If you selected one or two images, the global alignment procedure will be skipped (mode=GlobalAlignerMode.PairViewer)

Hit "Run" and wait.
When the global alignment ends, the reconstruction appears.
Use the slider "min_conf_thr" to show or remove low-confidence areas.

```bash
python3 demo.py --model_name DUSt3R_ViTLarge_BaseDecoder_512_dpt

# Use --weights to load a checkpoint from a local file, e.g. --weights checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth
# Use --image_size to select the correct resolution for the selected checkpoint. 512 (default) or 224
# Use --local_network to make it accessible on the local network, or --server_name to specify the url manually
# Use --server_port to change the port; by default it will search for an available port starting at 7860
# Use --device to use a different device; by default it's "cuda"
```

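The same confidence filtering can be reproduced in code. A minimal sketch, assuming the `pred1` output described in the [Usage](#usage) section below (`min_conf_thr` here is an illustrative variable, not a `demo.py` flag):

```python
min_conf_thr = 3.0  # illustrative threshold; the demo exposes this as a slider

# keep only 3D points whose predicted confidence exceeds the threshold
mask = pred1['conf'] > min_conf_thr
reliable_pts3d = pred1['pts3d'][mask]
```
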
### Interactive demo with docker

To run DUSt3R using Docker, including with NVIDIA CUDA support, follow these instructions:

1. **Install Docker**: If not already installed, download and install `docker` and `docker compose` from the [Docker website](https://www.docker.com/get-started).

2. **Install NVIDIA Docker Toolkit**: For GPU support, install the NVIDIA Docker toolkit from the [Nvidia website](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).

3. **Build the Docker image and run it**: `cd` into the `./docker` directory and run the following commands:

```bash
cd docker
bash run.sh --with-cuda --model_name="DUSt3R_ViTLarge_BaseDecoder_512_dpt"
```

Or if you want to run the demo without CUDA support, run the following command:

```bash
cd docker
bash run.sh --model_name="DUSt3R_ViTLarge_BaseDecoder_512_dpt"
```

By default, `demo.py` is launched with the option `--local_network`.
Visit `http://localhost:7860/` to access the web UI (or replace `localhost` with the machine's name to access it from the network).

`run.sh` will launch docker-compose using either the [docker-compose-cuda.yml](docker/docker-compose-cuda.yml) or [docker-compose-cpu.yml](docker/docker-compose-cpu.yml) config file, then it starts the demo using [entrypoint.sh](docker/files/entrypoint.sh).
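
Roughly, that amounts to the following (a sketch, not the exact contents of `run.sh`; see [docker/run.sh](docker/run.sh) for the real logic):

```bash
# with --with-cuda:
docker compose -f docker/docker-compose-cuda.yml up --build
# without CUDA:
docker compose -f docker/docker-compose-cpu.yml up --build
```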

![demo](assets/dust3r_archi.jpg)

## Usage

```python
from dust3r.inference import inference
from dust3r.model import AsymmetricCroCo3DStereo
from dust3r.utils.image import load_images
from dust3r.image_pairs import make_pairs
from dust3r.cloud_opt import global_aligner, GlobalAlignerMode

if __name__ == '__main__':
    device = 'cuda'
    batch_size = 1
    schedule = 'cosine'
    lr = 0.01
    niter = 300

    model_name = "naver/DUSt3R_ViTLarge_BaseDecoder_512_dpt"
    # you can put the path to a local checkpoint in model_name if needed
    model = AsymmetricCroCo3DStereo.from_pretrained(model_name).to(device)
    # load_images can take a list of images or a directory
    images = load_images(['croco/assets/Chateau1.png', 'croco/assets/Chateau2.png'], size=512)
    pairs = make_pairs(images, scene_graph='complete', prefilter=None, symmetrize=True)
    output = inference(pairs, model, device, batch_size=batch_size)

    # at this stage, you have the raw dust3r predictions
    view1, pred1 = output['view1'], output['pred1']
    view2, pred2 = output['view2'], output['pred2']
    # here, view1, pred1, view2, pred2 are dicts of lists of length 2
    # -> because we symmetrize, we have (im1, im2) and (im2, im1) pairs
    # in each view you have:
    # an integer image identifier: view1['idx'] and view2['idx']
    # the img: view1['img'] and view2['img']
    # the image shape: view1['true_shape'] and view2['true_shape']
    # an instance string output by the dataloader: view1['instance'] and view2['instance']
    # pred1 and pred2 contain the confidence values: pred1['conf'] and pred2['conf']
    # pred1 contains 3D points for view1['img'] in view1['img'] space: pred1['pts3d']
    # pred2 contains 3D points for view2['img'] in view1['img'] space: pred2['pts3d_in_other_view']

    # next we'll use the global_aligner to align the predictions
    # depending on your task, you may be fine with the raw output and not need it
    # with only two input images, you could use GlobalAlignerMode.PairViewer: it would just convert the output
    # if using GlobalAlignerMode.PairViewer, no need to run compute_global_alignment
    scene = global_aligner(output, device=device, mode=GlobalAlignerMode.PointCloudOptimizer)
    loss = scene.compute_global_alignment(init="mst", niter=niter, schedule=schedule, lr=lr)

    # retrieve useful values from scene:
    imgs = scene.imgs
    focals = scene.get_focals()
    poses = scene.get_im_poses()
    pts3d = scene.get_pts3d()
    confidence_masks = scene.get_masks()

    # visualize reconstruction
    scene.show()

    # find 2D-2D matches between the two images
    from dust3r.utils.geometry import find_reciprocal_matches, xy_grid
    pts2d_list, pts3d_list = [], []
    for i in range(2):
        conf_i = confidence_masks[i].cpu().numpy()
        pts2d_list.append(xy_grid(*imgs[i].shape[:2][::-1])[conf_i])  # imgs[i].shape[:2] = (H, W)
        pts3d_list.append(pts3d[i].detach().cpu().numpy()[conf_i])
    reciprocal_in_P2, nn2_in_P1, num_matches = find_reciprocal_matches(*pts3d_list)
    print(f'found {num_matches} matches')
    matches_im1 = pts2d_list[1][reciprocal_in_P2]
    matches_im0 = pts2d_list[0][nn2_in_P1][reciprocal_in_P2]

    # visualize a few matches
    import numpy as np
    from matplotlib import pyplot as pl
    n_viz = 10
    match_idx_to_viz = np.round(np.linspace(0, num_matches - 1, n_viz)).astype(int)
    viz_matches_im0, viz_matches_im1 = matches_im0[match_idx_to_viz], matches_im1[match_idx_to_viz]

    H0, W0, H1, W1 = *imgs[0].shape[:2], *imgs[1].shape[:2]
    img0 = np.pad(imgs[0], ((0, max(H1 - H0, 0)), (0, 0), (0, 0)), 'constant', constant_values=0)
    img1 = np.pad(imgs[1], ((0, max(H0 - H1, 0)), (0, 0), (0, 0)), 'constant', constant_values=0)
    img = np.concatenate((img0, img1), axis=1)
    pl.figure()
    pl.imshow(img)
    cmap = pl.get_cmap('jet')
    for i in range(n_viz):
        (x0, y0), (x1, y1) = viz_matches_im0[i].T, viz_matches_im1[i].T
        pl.plot([x0, x1 + W0], [y0, y1], '-+', color=cmap(i / (n_viz - 1)), scalex=False, scaley=False)
    pl.show(block=True)
```
![matching](assets/matching.jpg)

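For the two-image case mentioned in the comments above, a minimal sketch of the `PairViewer` variant (same `output` as in the snippet; no optimization is run):

```python
# PairViewer just converts the raw two-view predictions,
# so there is no compute_global_alignment call.
scene = global_aligner(output, device=device, mode=GlobalAlignerMode.PairViewer)
poses = scene.get_im_poses()
pts3d = scene.get_pts3d()
confidence_masks = scene.get_masks()
```
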
## Training

In this section, we present a short demonstration to get started with training DUSt3R.

### Datasets
At this moment, we have added the following training datasets:
- [CO3Dv2](https://github.com/facebookresearch/co3d) - [Creative Commons Attribution-NonCommercial 4.0 International](https://github.com/facebookresearch/co3d/blob/main/LICENSE)
- [ARKitScenes](https://github.com/apple/ARKitScenes) - [Creative Commons Attribution-NonCommercial-ShareAlike 4.0](https://github.com/apple/ARKitScenes/tree/main?tab=readme-ov-file#license)
- [ScanNet++](https://kaldir.vc.in.tum.de/scannetpp/) - [non-commercial research and educational purposes](https://kaldir.vc.in.tum.de/scannetpp/static/scannetpp-terms-of-use.pdf)
- [BlendedMVS](https://github.com/YoYo000/BlendedMVS) - [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/)
- [WayMo Open dataset](https://github.com/waymo-research/waymo-open-dataset) - [Non-Commercial Use](https://waymo.com/open/terms/)
- [Habitat-Sim](https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md)
- [MegaDepth](https://www.cs.cornell.edu/projects/megadepth/)
- [StaticThings3D](https://github.com/lmb-freiburg/robustmvd/blob/master/rmvd/data/README.md#staticthings3d)
- [WildRGB-D](https://github.com/wildrgbd/wildrgbd/)

For each dataset, we provide a preprocessing script in the `datasets_preprocess` directory and an archive containing the list of pairs when needed.
You have to download the datasets yourself from their official sources, agree to their license, download our list of pairs, and run the preprocessing script.

Links:

[ARKitScenes pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/arkitscenes_pairs.zip)
[ScanNet++ pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/scannetpp_pairs.zip)
[BlendedMVS pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/blendedmvs_pairs.npy)
[WayMo Open dataset pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/waymo_pairs.npz)
[Habitat metadata](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/habitat_5views_v1_512x512_metadata.tar.gz)
[MegaDepth pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/megadepth_pairs.npz)
[StaticThings3D pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/staticthings_pairs.npy)

> [!NOTE]
> They are not strictly equivalent to what was used to train DUSt3R, but they should be close enough.

### Demo
For this training demo, we're going to download and prepare a subset of [CO3Dv2](https://github.com/facebookresearch/co3d) - [Creative Commons Attribution-NonCommercial 4.0 International](https://github.com/facebookresearch/co3d/blob/main/LICENSE) and launch the training code on it.
The demo model will be trained for a few epochs on a very small dataset.
It will not be very good.

```bash
# download and prepare the co3d subset
mkdir -p data/co3d_subset
cd data/co3d_subset
git clone https://github.com/facebookresearch/co3d
cd co3d
python3 ./co3d/download_dataset.py --download_folder ../ --single_sequence_subset
rm ../*.zip
cd ../../..

python3 datasets_preprocess/preprocess_co3d.py --co3d_dir data/co3d_subset --output_dir data/co3d_subset_processed --single_sequence_subset

# download the pretrained croco v2 checkpoint
mkdir -p checkpoints/
wget https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTLarge_BaseDecoder.pth -P checkpoints/

# the training of dust3r is done in 3 steps.
# for this example we'll do fewer epochs; for the actual hyperparameters we used in the paper, see the next section: "Our Hyperparameters"
# step 1 - train dust3r for 224 resolution
torchrun --nproc_per_node=4 train.py \
    --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop=16, mask_bg='rand', resolution=224, transform=ColorJitter)" \
    --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=224, seed=777)" \
    --model "AsymmetricCroCo3DStereo(pos_embed='RoPE100', img_size=(224, 224), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \
    --train_criterion "ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \
    --test_criterion "Regr3D_ScaleShiftInv(L21, gt_scale=True)" \
    --pretrained "checkpoints/CroCo_V2_ViTLarge_BaseDecoder.pth" \
    --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 16 --accum_iter 1 \
    --save_freq 1 --keep_freq 5 --eval_freq 1 \
    --output_dir "checkpoints/dust3r_demo_224"

# step 2 - train dust3r for 512 resolution
torchrun --nproc_per_node=4 train.py \
    --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)" \
    --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=(512,384), seed=777)" \
    --model "AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \
    --train_criterion "ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \
    --test_criterion "Regr3D_ScaleShiftInv(L21, gt_scale=True)" \
    --pretrained "checkpoints/dust3r_demo_224/checkpoint-best.pth" \
    --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 4 --accum_iter 4 \
    --save_freq 1 --keep_freq 5 --eval_freq 1 \
    --output_dir "checkpoints/dust3r_demo_512"

# step 3 - train dust3r for 512 resolution with dpt
torchrun --nproc_per_node=4 train.py \
    --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)" \
    --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=(512,384), seed=777)" \
    --model "AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \
    --train_criterion "ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \
    --test_criterion "Regr3D_ScaleShiftInv(L21, gt_scale=True)" \
    --pretrained "checkpoints/dust3r_demo_512/checkpoint-best.pth" \
    --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 2 --accum_iter 8 \
    --save_freq 1 --keep_freq 5 --eval_freq 1 --disable_cudnn_benchmark \
    --output_dir "checkpoints/dust3r_demo_512dpt"

```

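Note how the three steps keep the same effective batch size while the resolution (and per-GPU memory use) grows; a quick check of the numbers above:

```python
# effective batch = per-gpu batch size * num_gpus * accum_iter
steps = {'224': (16, 4, 1), '512': (4, 4, 4), '512dpt': (2, 4, 8)}
for name, (bs, gpus, accum) in steps.items():
    print(name, bs * gpus * accum)  # -> 64 for every step
```
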
### Our Hyperparameters

Here are the commands we used for training the models:

```bash
# NOTE: ROOT path omitted for datasets
# 224 linear
torchrun --nproc_per_node 8 train.py \
    --train_dataset=" + 100_000 @ Habitat(1_000_000, split='train', aug_crop=16, resolution=224, transform=ColorJitter) + 100_000 @ BlendedMVS(split='train', aug_crop=16, resolution=224, transform=ColorJitter) + 100_000 @ MegaDepth(split='train', aug_crop=16, resolution=224, transform=ColorJitter) + 100_000 @ ARKitScenes(aug_crop=256, resolution=224, transform=ColorJitter) + 100_000 @ Co3d(split='train', aug_crop=16, mask_bg='rand', resolution=224, transform=ColorJitter) + 100_000 @ StaticThings3D(aug_crop=256, mask_bg='rand', resolution=224, transform=ColorJitter) + 100_000 @ ScanNetpp(split='train', aug_crop=256, resolution=224, transform=ColorJitter) + 100_000 @ InternalUnreleasedDataset(aug_crop=128, resolution=224, transform=ColorJitter) " \
    --test_dataset=" Habitat(1_000, split='val', resolution=224, seed=777) + 1_000 @ BlendedMVS(split='val', resolution=224, seed=777) + 1_000 @ MegaDepth(split='val', resolution=224, seed=777) + 1_000 @ Co3d(split='test', mask_bg='rand', resolution=224, seed=777) " \
    --train_criterion="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \
    --test_criterion="Regr3D_ScaleShiftInv(L21, gt_scale=True)" \
    --model="AsymmetricCroCo3DStereo(pos_embed='RoPE100', img_size=(224, 224), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \
    --pretrained="checkpoints/CroCo_V2_ViTLarge_BaseDecoder.pth" \
    --lr=0.0001 --min_lr=1e-06 --warmup_epochs=10 --epochs=100 --batch_size=16 --accum_iter=1 \
    --save_freq=5 --keep_freq=10 --eval_freq=1 \
    --output_dir="checkpoints/dust3r_224"

# 512 linear
torchrun --nproc_per_node 8 train.py \
    --train_dataset=" + 10_000 @ Habitat(1_000_000, split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ BlendedMVS(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ MegaDepth(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ARKitScenes(aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ Co3d(split='train', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ StaticThings3D(aug_crop=256, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ScanNetpp(split='train', aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ InternalUnreleasedDataset(aug_crop=128, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) " \
    --test_dataset=" Habitat(1_000, split='val', resolution=(512,384), seed=777) + 1_000 @ BlendedMVS(split='val', resolution=(512,384), seed=777) + 1_000 @ MegaDepth(split='val', resolution=(512,336), seed=777) + 1_000 @ Co3d(split='test', resolution=(512,384), seed=777) " \
    --train_criterion="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \
    --test_criterion="Regr3D_ScaleShiftInv(L21, gt_scale=True)" \
    --model="AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \
    --pretrained="checkpoints/dust3r_224/checkpoint-best.pth" \
    --lr=0.0001 --min_lr=1e-06 --warmup_epochs=20 --epochs=100 --batch_size=4 --accum_iter=2 \
    --save_freq=10 --keep_freq=10 --eval_freq=1 --print_freq=10 \
    --output_dir="checkpoints/dust3r_512"

# 512 dpt
torchrun --nproc_per_node 8 train.py \
    --train_dataset=" + 10_000 @ Habitat(1_000_000, split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ BlendedMVS(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ MegaDepth(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ARKitScenes(aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ Co3d(split='train', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ StaticThings3D(aug_crop=256, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ScanNetpp(split='train', aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ InternalUnreleasedDataset(aug_crop=128, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) " \
    --test_dataset=" Habitat(1_000, split='val', resolution=(512,384), seed=777) + 1_000 @ BlendedMVS(split='val', resolution=(512,384), seed=777) + 1_000 @ MegaDepth(split='val', resolution=(512,336), seed=777) + 1_000 @ Co3d(split='test', resolution=(512,384), seed=777) " \
    --train_criterion="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \
    --test_criterion="Regr3D_ScaleShiftInv(L21, gt_scale=True)" \
    --model="AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \
    --pretrained="checkpoints/dust3r_512/checkpoint-best.pth" \
    --lr=0.0001 --min_lr=1e-06 --warmup_epochs=15 --epochs=90 --batch_size=4 --accum_iter=2 \
    --save_freq=5 --keep_freq=10 --eval_freq=1 --print_freq=10 --disable_cudnn_benchmark \
    --output_dir="checkpoints/dust3r_512dpt"

```
dust3r/assets/demo.jpg
ADDED (Git LFS)

dust3r/assets/dust3r.jpg
ADDED

dust3r/assets/dust3r_archi.jpg
ADDED

dust3r/assets/matching.jpg
ADDED (Git LFS)

dust3r/assets/pipeline1.jpg
ADDED

dust3r/croco/LICENSE
ADDED
@@ -0,0 +1,52 @@
CroCo, Copyright (c) 2022-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license.

A summary of the CC BY-NC-SA 4.0 license is located here:
https://creativecommons.org/licenses/by-nc-sa/4.0/

The CC BY-NC-SA 4.0 license is located here:
https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode


SEE NOTICE BELOW WITH RESPECT TO THE FILES: models/pos_embed.py, models/blocks.py

***************************

NOTICE WITH RESPECT TO THE FILE: models/pos_embed.py

This software is being redistributed in a modified form. The original form is available here:

https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py

The software in this file incorporates parts of the following software, available here:

Transformer: https://github.com/tensorflow/models/blob/master/official/legacy/transformer/model_utils.py
available under the following license: https://github.com/tensorflow/models/blob/master/LICENSE

MoCo v3: https://github.com/facebookresearch/moco-v3
available under the following license: https://github.com/facebookresearch/moco-v3/blob/main/LICENSE

DeiT: https://github.com/facebookresearch/deit
available under the following license: https://github.com/facebookresearch/deit/blob/main/LICENSE


THE ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCED BELOW:

https://github.com/facebookresearch/mae/blob/main/LICENSE

Attribution-NonCommercial 4.0 International

***************************

NOTICE WITH RESPECT TO THE FILE: models/blocks.py

This software is being redistributed in a modified form. The original form is available here:

https://github.com/rwightman/pytorch-image-models

THE ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCED BELOW:

https://github.com/rwightman/pytorch-image-models/blob/master/LICENSE

Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
dust3r/croco/NOTICE
ADDED
@@ -0,0 +1,21 @@
CroCo
Copyright 2022-present NAVER Corp.

This project contains subcomponents with separate copyright notices and license terms.
Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses.

====

facebookresearch/mae
https://github.com/facebookresearch/mae

Attribution-NonCommercial 4.0 International

====

rwightman/pytorch-image-models
https://github.com/rwightman/pytorch-image-models

Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
dust3r/croco/README.MD
ADDED
@@ -0,0 +1,124 @@
# CroCo + CroCo v2 / CroCo-Stereo / CroCo-Flow

[[`CroCo arXiv`](https://arxiv.org/abs/2210.10716)] [[`CroCo v2 arXiv`](https://arxiv.org/abs/2211.10408)] [[`project page and demo`](https://croco.europe.naverlabs.com/)]

This repository contains the code for our CroCo model presented in our NeurIPS'22 paper [CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion](https://openreview.net/pdf?id=wZEfHUM5ri) and its follow-up extension published at ICCV'23 [Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow](https://openaccess.thecvf.com/content/ICCV2023/html/Weinzaepfel_CroCo_v2_Improved_Cross-view_Completion_Pre-training_for_Stereo_Matching_and_ICCV_2023_paper.html), referred to as CroCo v2:

![image](assets/arch.jpg)

```bibtex
@inproceedings{croco,
  title={{CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion}},
  author={{Weinzaepfel, Philippe and Leroy, Vincent and Lucas, Thomas and Br\'egier, Romain and Cabon, Yohann and Arora, Vaibhav and Antsfeld, Leonid and Chidlovskii, Boris and Csurka, Gabriela and Revaud J\'er\^ome}},
  booktitle={{NeurIPS}},
  year={2022}
}

@inproceedings{croco_v2,
  title={{CroCo v2: Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow}},
  author={Weinzaepfel, Philippe and Lucas, Thomas and Leroy, Vincent and Cabon, Yohann and Arora, Vaibhav and Br{\'e}gier, Romain and Csurka, Gabriela and Antsfeld, Leonid and Chidlovskii, Boris and Revaud, J{\'e}r{\^o}me},
  booktitle={ICCV},
  year={2023}
}
```

## License

The code is distributed under the CC BY-NC-SA 4.0 License. See [LICENSE](LICENSE) for more information.
Some components are based on code from [MAE](https://github.com/facebookresearch/mae) released under the CC BY-NC-SA 4.0 License and [timm](https://github.com/rwightman/pytorch-image-models) released under the Apache 2.0 License.
Some components for stereo matching and optical flow are based on code from [unimatch](https://github.com/autonomousvision/unimatch) released under the MIT license.

## Preparation

1. Install dependencies on a machine with an NVIDIA GPU using e.g. conda. Note that `habitat-sim` is required only for the interactive demo and the synthetic pre-training data generation. If you don't plan to use it, you can skip the line installing it and use a more recent python version.

```bash
conda create -n croco python=3.7 cmake=3.14.0
conda activate croco
conda install habitat-sim headless -c conda-forge -c aihabitat
conda install pytorch torchvision -c pytorch
conda install notebook ipykernel matplotlib
conda install ipywidgets widgetsnbextension
conda install scikit-learn tqdm quaternion opencv  # only for pretraining / habitat data generation

```

2. Compile cuda kernels for RoPE

CroCo v2 relies on RoPE positional embeddings for which you need to compile some cuda kernels.
```bash
cd models/curope/
python setup.py build_ext --inplace
cd ../../
```

This can take a while as we compile for all cuda architectures; feel free to update L9 of `models/curope/setup.py` to compile for specific architectures only.
You might also need to set the environment variable `CUDA_HOME` in case you use a custom cuda installation.

In case you cannot compile the kernels, we also provide a slower pytorch fallback, which will be loaded automatically.

3. Download pre-trained model

We provide several pre-trained models:

| modelname | pre-training data | pos. embed. | Encoder | Decoder |
|------------------------------------------------------------------------------------------------------------------------------------|-------------------|-------------|---------|---------|
| [`CroCo.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth) | Habitat | cosine | ViT-B | Small |
| [`CroCo_V2_ViTBase_SmallDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_SmallDecoder.pth) | Habitat + real | RoPE | ViT-B | Small |
| [`CroCo_V2_ViTBase_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_BaseDecoder.pth) | Habitat + real | RoPE | ViT-B | Base |
| [`CroCo_V2_ViTLarge_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTLarge_BaseDecoder.pth) | Habitat + real | RoPE | ViT-L | Base |

To download a specific model, e.g. the first one (`CroCo.pth`):
```bash
mkdir -p pretrained_models/
wget https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth -P pretrained_models/
```

## Reconstruction example

After downloading the `CroCo_V2_ViTLarge_BaseDecoder` pretrained model (or updating the corresponding line in `demo.py`), simply run:
```bash
python demo.py
```

## Interactive demonstration of cross-view completion reconstruction on the Habitat simulator

First download the test scene from Habitat:
```bash
python -m habitat_sim.utils.datasets_download --uids habitat_test_scenes --data-path habitat-sim-data/
```

Then, run the Notebook demo `interactive_demo.ipynb`.

In this demo, you should be able to sample a random reference viewpoint from a [Habitat](https://github.com/facebookresearch/habitat-sim) test scene. Use the sliders to change viewpoint and select a masked target view to reconstruct using CroCo.
![croco_interactive_demo](assets/interactive_demo.gif)

## Pre-training

### CroCo

To pre-train CroCo, please first generate the pre-training data from the Habitat simulator, following the instructions in [datasets/habitat_sim/README.MD](datasets/habitat_sim/README.MD) and then run the following command:
```
torchrun --nproc_per_node=4 pretrain.py --output_dir ./output/pretraining/
```

Our CroCo pre-training was launched on a single server with 4 GPUs.
It should take around 10 days with A100 or 15 days with V100 to do the 400 pre-training epochs, but decent performance is obtained earlier in training.
Note that, while the code contains the same learning-rate scaling rule as MAE when changing the effective batch size, we did not experiment to check whether it remains valid in our case.
The first run can take a few minutes to start, to parse all available pre-training pairs.

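For reference, the MAE-style linear scaling rule that note refers to looks like this (a sketch of the convention with an assumed base learning rate; as stated above, its validity for CroCo was not verified):

```python
# linear lr scaling as in MAE: lr grows proportionally with the effective batch size
base_lr = 1.5e-4  # assumed per-256-samples base value, as in MAE
batch_per_gpu, num_gpus, accum_iter = 64, 4, 1
effective_batch = batch_per_gpu * num_gpus * accum_iter
lr = base_lr * effective_batch / 256  # a 256-sample effective batch -> lr = base_lr
```
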
### CroCo v2

For CroCo v2 pre-training, in addition to the generation of the pre-training data from the Habitat simulator above, please pre-extract the crops from the real datasets following the instructions in [datasets/crops/README.MD](datasets/crops/README.MD).
Then, run the following command for the largest model (ViT-L encoder, Base decoder):
```
torchrun --nproc_per_node=8 pretrain.py --model "CroCoNet(enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_num_heads=12, dec_depth=12, pos_embed='RoPE100')" --dataset "habitat_release+ARKitScenes+MegaDepth+3DStreetView+IndoorVL" --warmup_epochs 12 --max_epoch 125 --epochs 250 --amp 0 --keep_freq 5 --output_dir ./output/pretraining_crocov2/
```

Our CroCo v2 pre-training was launched on a single server with 8 GPUs for the largest model, and on a single server with 4 GPUs for the smaller ones, keeping a batch size of 64 per gpu in all cases.
The largest model should take around 12 days on A100.
Note that, while the code contains the same learning-rate scaling rule as MAE when changing the effective batch size, we did not experiment to check whether it remains valid in our case.

## Stereo matching and Optical flow downstream tasks

For CroCo-Stereo and CroCo-Flow, please refer to [stereoflow/README.MD](stereoflow/README.MD).
dust3r/croco/assets/Chateau1.png
ADDED (Git LFS)

dust3r/croco/assets/Chateau2.png
ADDED (Git LFS)

dust3r/croco/assets/arch.jpg
ADDED

dust3r/croco/croco-stereo-flow-demo.ipynb
ADDED
@@ -0,0 +1,191 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9bca0f41",
   "metadata": {},
   "source": [
    "# Simple inference example with CroCo-Stereo or CroCo-Flow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80653ef7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n",
    "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f033862",
   "metadata": {},
   "source": [
    "First download the model(s) of your choice by running\n",
    "```\n",
    "bash stereoflow/download_model.sh crocostereo.pth\n",
    "bash stereoflow/download_model.sh crocoflow.pth\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fb2e392",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import numpy as np\n",
    "from PIL import Image\n",
    "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n",
    "device = torch.device('cuda:0' if use_gpu else 'cpu')\n",
    "import matplotlib.pylab as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0e25d77",
   "metadata": {},
   "outputs": [],
   "source": [
    "from stereoflow.test import _load_model_and_criterion\n",
    "from stereoflow.engine import tiled_pred\n",
    "from stereoflow.datasets_stereo import img_to_tensor, vis_disparity\n",
    "from stereoflow.datasets_flow import flowToColor\n",
    "tile_overlap=0.7 # recommended value, higher value can be slightly better but slower"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "86a921f5",
   "metadata": {},
   "source": [
    "### CroCo-Stereo example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64e483cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "image1 = np.asarray(Image.open('<path_to_left_image>'))\n",
    "image2 = np.asarray(Image.open('<path_to_right_image>'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0d04303",
   "metadata": {},
   "outputs": [],
   "source": [
    "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocostereo.pth', None, device)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47dc14b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n",
    "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n",
    "with torch.inference_mode():\n",
    "    pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n",
    "pred = pred.squeeze(0).squeeze(0).cpu().numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "583b9f16",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.imshow(vis_disparity(pred))\n",
    "plt.axis('off')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d2df5d70",
   "metadata": {},
   "source": [
    "### CroCo-Flow example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ee257a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "image1 = np.asarray(Image.open('<path_to_first_image>'))\n",
    "image2 = np.asarray(Image.open('<path_to_second_image>'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5edccf0",
   "metadata": {},
   "outputs": [],
   "source": [
    "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocoflow.pth', None, device)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b19692c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n",
    "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n",
    "with torch.inference_mode():\n",
    "    pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n",
    "pred = pred.squeeze(0).permute(1,2,0).cpu().numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26f79db3",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.imshow(flowToColor(pred))\n",
    "plt.axis('off')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
dust3r/croco/datasets_croco/__init__.py
ADDED (empty file)

dust3r/croco/datasets_croco/crops/README.MD
ADDED
@@ -0,0 +1,104 @@
## Generation of crops from the real datasets

The instructions below explain how to generate the crops used for pre-training CroCo v2 from the following real-world datasets: ARKitScenes, MegaDepth, 3DStreetView and IndoorVL.

### Download the metadata of the crops to generate

First, download the metadata and put it in `./data/`:
```
mkdir -p data
cd data/
wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/crop_metadata.zip
unzip crop_metadata.zip
rm crop_metadata.zip
cd ..
```

### Prepare the original datasets

Second, download the original datasets into `./data/original_datasets/`.
```
mkdir -p data/original_datasets
```

##### ARKitScenes

Download the `raw` dataset from https://github.com/apple/ARKitScenes/blob/main/DATA.md and put it in `./data/original_datasets/ARKitScenes/`.
The resulting file structure should be like:
```
./data/original_datasets/ARKitScenes/
└───Training
    └───40753679
    │   │ ultrawide
    │   │ ...
    └───40753686
    │
    ...
```

##### MegaDepth

Download `MegaDepth v1 Dataset` from https://www.cs.cornell.edu/projects/megadepth/ and put it in `./data/original_datasets/MegaDepth/`.
The resulting file structure should be like:

```
./data/original_datasets/MegaDepth/
└───0000
│   └───images
│   │   │ 1000557903_87fa96b8a4_o.jpg
│   │   └ ...
│   └─── ...
└───0001
│   │
│   └ ...
└─── ...
```

##### 3DStreetView

Download `3D_Street_View` dataset from https://github.com/amir32002/3D_Street_View and put it in `./data/original_datasets/3DStreetView/`.
The resulting file structure should be like:

```
./data/original_datasets/3DStreetView/
└───dataset_aligned
│   └───0002
│   │   │ 0000002_0000001_0000002_0000001.jpg
│   │   └ ...
│   └─── ...
└───dataset_unaligned
│   └───0003
│   │   │ 0000003_0000001_0000002_0000001.jpg
│   │   └ ...
│   └─── ...
```

##### IndoorVL

Download the `IndoorVL` datasets using [Kapture](https://github.com/naver/kapture).

```
pip install kapture
mkdir -p ./data/original_datasets/IndoorVL
cd ./data/original_datasets/IndoorVL
kapture_download_dataset.py update
kapture_download_dataset.py install "HyundaiDepartmentStore_*"
kapture_download_dataset.py install "GangnamStation_*"
cd -
```

### Extract the crops

Now, extract the crops for each dataset:
```
for dataset in ARKitScenes MegaDepth 3DStreetView IndoorVL;
do
    python3 datasets/crops/extract_crops_from_images.py --crops ./data/crop_metadata/${dataset}/crops_release.txt --root-dir ./data/original_datasets/${dataset}/ --output-dir ./data/${dataset}_crops/ --imsize 256 --nthread 8 --max-subdir-levels 5 --ideal-number-pairs-in-dir 500;
done
```

##### Note for IndoorVL

Due to some legal issues, we can only release 144,228 pairs out of the 1,593,689 pairs used in the paper.
To account for this in terms of the number of pre-training iterations, the pre-training command in this repository uses 125 training epochs, including 12 warm-up epochs, and a learning-rate cosine schedule over 250 epochs, instead of 100, 10 and 200 respectively.
The impact on the performance is negligible.
dust3r/croco/datasets_croco/crops/extract_crops_from_images.py
ADDED
@@ -0,0 +1,159 @@
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# Extracting crops for pre-training
# --------------------------------------------------------

import os
import argparse
from tqdm import tqdm
from PIL import Image
import functools
from multiprocessing import Pool
import math


def arg_parser():
    parser = argparse.ArgumentParser('Generate cropped image pairs from image crop list')

    parser.add_argument('--crops', type=str, required=True, help='crop file')
    parser.add_argument('--root-dir', type=str, required=True, help='root directory')
    parser.add_argument('--output-dir', type=str, required=True, help='output directory')
    parser.add_argument('--imsize', type=int, default=256, help='size of the crops')
    parser.add_argument('--nthread', type=int, required=True, help='number of simultaneous threads')
    parser.add_argument('--max-subdir-levels', type=int, default=5, help='maximum number of subdirectories')
    parser.add_argument('--ideal-number-pairs-in-dir', type=int, default=500, help='number of pairs stored in a dir')
    return parser


def main(args):
    listing_path = os.path.join(args.output_dir, 'listing.txt')

    print(f'Loading list of crops ... ({args.nthread} threads)')
    crops, num_crops_to_generate = load_crop_file(args.crops)

    print(f'Preparing jobs ({len(crops)} candidate image pairs)...')
    # choose a directory hierarchy deep enough to keep roughly
    # ideal_number_pairs_in_dir files per directory
    num_levels = min(math.ceil(math.log(num_crops_to_generate, args.ideal_number_pairs_in_dir)), args.max_subdir_levels)
    num_pairs_in_dir = math.ceil(num_crops_to_generate ** (1 / num_levels))

    jobs = prepare_jobs(crops, num_levels, num_pairs_in_dir)
    del crops

    os.makedirs(args.output_dir, exist_ok=True)
    mmap = Pool(args.nthread).imap_unordered if args.nthread > 1 else map
    call = functools.partial(save_image_crops, args)

    print(f"Generating cropped images to {args.output_dir} ...")
    with open(listing_path, 'w') as listing:
        listing.write('# pair_path\n')
        for results in tqdm(mmap(call, jobs), total=len(jobs)):
            for path in results:
                listing.write(f'{path}\n')
    print('Finished writing listing to', listing_path)


def load_crop_file(path):
    data = open(path).read().splitlines()
    pairs = []
    num_crops_to_generate = 0
    for line in tqdm(data):
        if line.startswith('#'):
            continue
        line = line.split(', ')
        if len(line) < 8:
            # a new image pair: two image paths and a relative rotation
            img1, img2, rotation = line
            pairs.append((img1, img2, int(rotation), []))
        else:
            # a pair of crop rectangles (left, right, top, bottom) for the latest image pair
            l1, r1, t1, b1, l2, r2, t2, b2 = map(int, line)
            rect1, rect2 = (l1, t1, r1, b1), (l2, t2, r2, b2)
            pairs[-1][-1].append((rect1, rect2))
            num_crops_to_generate += 1
    return pairs, num_crops_to_generate


def prepare_jobs(pairs, num_levels, num_pairs_in_dir):
    jobs = []
    powers = [num_pairs_in_dir**level for level in reversed(range(num_levels))]

    def get_path(idx):
        # map a flat crop index to a hierarchical path of hex components
        idx_array = []
        d = idx
        for level in range(num_levels - 1):
            idx_array.append(idx // powers[level])
            idx = idx % powers[level]
        idx_array.append(d)
        return '/'.join(map(lambda x: hex(x)[2:], idx_array))

    idx = 0
    for pair_data in tqdm(pairs):
        img1, img2, rotation, crops = pair_data
        if -60 <= rotation and rotation <= 60:
            rotation = 0  # most likely not a true rotation
        paths = [get_path(idx + k) for k in range(len(crops))]
        idx += len(crops)
        jobs.append(((img1, img2), rotation, crops, paths))
    return jobs


def load_image(path):
    try:
        return Image.open(path).convert('RGB')
    except Exception as e:
        print('skipping', path, e)
        raise OSError()


def save_image_crops(args, data):
    # load images
    img_pair, rot, crops, paths = data
    try:
        img1, img2 = [load_image(os.path.join(args.root_dir, impath)) for impath in img_pair]
    except OSError:
        return []

    def area(sz):
        return sz[0] * sz[1]

    tgt_size = (args.imsize, args.imsize)

    def prepare_crop(img, rect, rot=0):
        # actual crop
        img = img.crop(rect)

        # resize to desired size
        interp = Image.Resampling.LANCZOS if area(img.size) > 4 * area(tgt_size) else Image.Resampling.BICUBIC
        img = img.resize(tgt_size, resample=interp)

        # rotate the image
        rot90 = (round(rot / 90) % 4) * 90
        if rot90 == 90:
            img = img.transpose(Image.Transpose.ROTATE_90)
        elif rot90 == 180:
            img = img.transpose(Image.Transpose.ROTATE_180)
        elif rot90 == 270:
            img = img.transpose(Image.Transpose.ROTATE_270)
        return img

    results = []
    for (rect1, rect2), path in zip(crops, paths):
        crop1 = prepare_crop(img1, rect1)
        crop2 = prepare_crop(img2, rect2, rot)

        fullpath1 = os.path.join(args.output_dir, path + '_1.jpg')
        fullpath2 = os.path.join(args.output_dir, path + '_2.jpg')
        os.makedirs(os.path.dirname(fullpath1), exist_ok=True)

        assert not os.path.isfile(fullpath1), fullpath1
        assert not os.path.isfile(fullpath2), fullpath2
        crop1.save(fullpath1)
        crop2.save(fullpath2)
        results.append(path)

    return results


if __name__ == '__main__':
    args = arg_parser().parse_args()
    main(args)
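
For intuition, the hierarchical layout produced by `get_path` above maps a flat crop index to nested hex components (a quick illustration, not part of the script):

```python
# with num_levels=2 and num_pairs_in_dir=16, crop index 300 becomes "12/12c":
# 300 // 16 = 18 -> directory "12" (hex); the leaf keeps the full index, hex(300) = "12c"
print(hex(300 // 16)[2:] + '/' + hex(300)[2:])  # -> 12/12c
```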
dust3r/croco/datasets_croco/habitat_sim/README.MD
ADDED
@@ -0,0 +1,76 @@
| 1 |
+
## Generation of synthetic image pairs using Habitat-Sim
|
| 2 |
+
|
| 3 |
+
These instructions allow to generate pre-training pairs from the Habitat simulator.
|
| 4 |
+
As we did not save metadata of the pairs used in the original paper, they are not strictly the same, but these data use the same setting and are equivalent.
|
| 5 |
+
|
| 6 |
+
### Download Habitat-Sim scenes
|
| 7 |
+
Download Habitat-Sim scenes:
|
| 8 |
+
- Download links can be found here: https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md
|
| 9 |
+
- We used scenes from the HM3D, habitat-test-scenes, Replica, ReplicaCad and ScanNet datasets.
|
| 10 |
+
- Please put the scenes under `./data/habitat-sim-data/scene_datasets/` following the structure below, or update manually paths in `paths.py`.
|
| 11 |
+
```
|
| 12 |
+
./data/
|
| 13 |
+
└──habitat-sim-data/
|
| 14 |
+
└──scene_datasets/
|
| 15 |
+
├──hm3d/
|
| 16 |
+
├──gibson/
|
| 17 |
+
├──habitat-test-scenes/
|
| 18 |
+
├──replica_cad_baked_lighting/
|
| 19 |
+
├──replica_cad/
|
| 20 |
+
├──ReplicaDataset/
|
| 21 |
+
└──scannet/
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
### Image pairs generation
|
| 25 |
+
We provide metadata to generate reproducible images pairs for pretraining and validation.
|
| 26 |
+
Experiments described in the paper used similar data, but whose generation was not reproducible at the time.
|
| 27 |
+
|
| 28 |
+
Specifications:
|
| 29 |
+
- 256x256 resolution images, with 60 degrees field of view .
|
| 30 |
+
- Up to 1000 image pairs per scene.
|
| 31 |
+
- Number of scenes considered/number of images pairs per dataset:
|
| 32 |
+
- Scannet: 1097 scenes / 985 209 pairs
|
| 33 |
+
- HM3D:
|
| 34 |
+
- hm3d/train: 800 / 800k pairs
|
| 35 |
+
- hm3d/val: 100 scenes / 100k pairs
|
| 36 |
+
- hm3d/minival: 10 scenes / 10k pairs
|
| 37 |
+
- habitat-test-scenes: 3 scenes / 3k pairs
|
| 38 |
+
- replica_cad_baked_lighting: 13 scenes / 13k pairs
|
| 39 |
+
|
| 40 |
+
- Scenes from hm3d/val and hm3d/minival pairs were not used for the pre-training but kept for validation purposes.
|
| 41 |
+
|
| 42 |
+
Download metadata and extract it:
|
| 43 |
+
```bash
|
| 44 |
+
mkdir -p data/habitat_release_metadata/
|
| 45 |
+
cd data/habitat_release_metadata/
|
| 46 |
+
wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/habitat_release_metadata/multiview_habitat_metadata.tar.gz
|
| 47 |
+
tar -xvf multiview_habitat_metadata.tar.gz
|
| 48 |
+
cd ../..
|
| 49 |
+
# Location of the metadata
|
| 50 |
+
METADATA_DIR="./data/habitat_release_metadata/multiview_habitat_metadata"
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
Generate image pairs from metadata:
|
| 54 |
+
- The following command will print a list of commandlines to generate image pairs for each scene:
|
| 55 |
+
```bash
|
| 56 |
+
# Target output directory
|
| 57 |
+
PAIRS_DATASET_DIR="./data/habitat_release/"
|
| 58 |
+
python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR
|
| 59 |
+
```
|
| 60 |
+
- One can launch multiple of such commands in parallel e.g. using GNU Parallel:
|
| 61 |
+
```bash
|
| 62 |
+
python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR | parallel -j 16
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
## Metadata generation
|
| 66 |
+
|
| 67 |
+
Image pairs were randomly sampled using the following commands, whose outputs contain randomness and are thus not exactly reproducible:
|
| 68 |
+
```bash
|
| 69 |
+
# Print commandlines to generate image pairs from the different scenes available.
|
| 70 |
+
PAIRS_DATASET_DIR=MY_CUSTOM_PATH
|
| 71 |
+
python datasets/habitat_sim/generate_multiview_images.py --list_commands --output_dir=$PAIRS_DATASET_DIR
|
| 72 |
+
|
| 73 |
+
# Once a dataset is generated, pack metadata files for reproducibility.
|
| 74 |
+
METADATA_DIR=MY_CUSTON_PATH
|
| 75 |
+
python datasets/habitat_sim/pack_metadata_files.py $PAIRS_DATASET_DIR $METADATA_DIR
|
| 76 |
+
```
|
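Each packed scene directory contains a `metadata.json` whose `multiviews` dictionary stores the sampled camera poses (see `generate_multiview_images.py` further below for the exact schema). A minimal sketch for inspecting one such file; the path is a placeholder:

```python
import json

# Placeholder path to one packed metadata file.
with open("data/habitat_release_metadata/multiview_habitat_metadata/some_scene/metadata.json") as f:
    metadata = json.load(f)

print("scene:", metadata["scene"])
print("multiview tuples:", len(metadata["multiviews"]))
for idx_label, views in list(metadata["multiviews"].items())[:3]:
    # One camera position/orientation per view, plus overlap statistics.
    print(idx_label, len(views["positions"]), "views, covisibility:", views["covisibility_ratios"])
```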
dust3r/croco/datasets_croco/habitat_sim/__init__.py
ADDED
File without changes
dust3r/croco/datasets_croco/habitat_sim/generate_from_metadata.py
ADDED
@@ -0,0 +1,92 @@
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).

"""
Script to generate image pairs for a given scene, reproducing poses provided in a metadata file.
"""
import os
from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator
from datasets.habitat_sim.paths import SCENES_DATASET
import argparse
import quaternion
import PIL.Image
import cv2
import json
from tqdm import tqdm

def generate_multiview_images_from_metadata(metadata_filename,
                                            output_dir,
                                            overload_params=dict(),
                                            scene_datasets_paths=None,
                                            exist_ok=False):
    """
    Generate images from a metadata file for reproducibility purposes.
    """
    # Reorder paths by decreasing label length, to avoid collisions when testing if a string starts with a given label
    if scene_datasets_paths is not None:
        scene_datasets_paths = dict(sorted(scene_datasets_paths.items(), key=lambda x: len(x[0]), reverse=True))

    with open(metadata_filename, 'r') as f:
        input_metadata = json.load(f)
    metadata = dict()
    for key, value in input_metadata.items():
        # Optionally replace some paths
        if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "":
            if scene_datasets_paths is not None:
                for dataset_label, dataset_path in scene_datasets_paths.items():
                    if value.startswith(dataset_label):
                        value = os.path.normpath(os.path.join(dataset_path, os.path.relpath(value, dataset_label)))
                        break
        metadata[key] = value

    # Overload some parameters
    for key, value in overload_params.items():
        metadata[key] = value

    generation_entries = dict([(key, value) for key, value in metadata.items() if not (key in ('multiviews', 'output_dir', 'generate_depth'))])
    generate_depth = metadata["generate_depth"]

    os.makedirs(output_dir, exist_ok=exist_ok)

    generator = MultiviewHabitatSimGenerator(**generation_entries)

    # Generate views
    for idx_label, data in tqdm(metadata['multiviews'].items()):
        positions = data["positions"]
        orientations = data["orientations"]
        n = len(positions)
        for oidx in range(n):
            observation = generator.render_viewpoint(positions[oidx], quaternion.from_float_array(orientations[oidx]))
            observation_label = f"{oidx + 1}"  # Observations are indexed starting from 1
            # Color image saved using PIL
            img = PIL.Image.fromarray(observation['color'][:,:,:3])
            filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg")
            img.save(filename)
            if generate_depth:
                # Depth image as EXR file
                filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr")
                cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF])
                # Camera parameters
                camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")])
                filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json")
                with open(filename, "w") as f:
                    json.dump(camera_params, f)
    # Save metadata
    with open(os.path.join(output_dir, "metadata.json"), "w") as f:
        json.dump(metadata, f)

    generator.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--metadata_filename", required=True)
    parser.add_argument("--output_dir", required=True)
    args = parser.parse_args()

    generate_multiview_images_from_metadata(metadata_filename=args.metadata_filename,
                                            output_dir=args.output_dir,
                                            scene_datasets_paths=SCENES_DATASET,
                                            overload_params=dict(),
                                            exist_ok=True)
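When depth generation is enabled, each rendered view also gets a `*_camera_params.json` with the intrinsics and the camera-to-world pose stored as nested lists. A minimal sketch for loading them back into numpy arrays (the filename is a placeholder following the pattern above):

```python
import json
import numpy as np

with open("00000000_1_camera_params.json") as f:  # placeholder filename
    params = json.load(f)

K = np.asarray(params["camera_intrinsics"])  # 3x3 pinhole intrinsics
R = np.asarray(params["R_cam2world"])        # 3x3 camera-to-world rotation (OpenCV convention)
t = np.asarray(params["t_cam2world"])        # 3-vector camera center in world coordinates

# A camera-frame point maps to world coordinates as R @ X_cam + t.
X_world = R @ np.array([0.0, 0.0, 1.0]) + t
```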
dust3r/croco/datasets_croco/habitat_sim/generate_from_metadata_files.py
ADDED
@@ -0,0 +1,27 @@
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).

"""
Script generating commandlines to generate image pairs from metadata files.
"""
import os
import glob
from tqdm import tqdm
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir", required=True)
    parser.add_argument("--output_dir", required=True)
    parser.add_argument("--prefix", default="", help="Commandline prefix, useful e.g. to set up the environment.")
    args = parser.parse_args()

    input_metadata_filenames = glob.iglob(f"{args.input_dir}/**/metadata.json", recursive=True)

    for metadata_filename in tqdm(input_metadata_filenames):
        output_dir = os.path.join(args.output_dir, os.path.relpath(os.path.dirname(metadata_filename), args.input_dir))
        # Skip the scene if its output metadata file already exists (i.e. it was already generated)
        if os.path.exists(os.path.join(output_dir, "metadata.json")):
            continue
        commandline = f"{args.prefix}python datasets/habitat_sim/generate_from_metadata.py --metadata_filename={metadata_filename} --output_dir={output_dir}"
        print(commandline)
dust3r/croco/datasets_croco/habitat_sim/generate_multiview_images.py
ADDED
@@ -0,0 +1,177 @@
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).

import os
from tqdm import tqdm
import argparse
import PIL.Image
import numpy as np
import json
from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator, NoNaviguableSpaceError
from datasets.habitat_sim.paths import list_scenes_available
import cv2
import quaternion
import shutil

def generate_multiview_images_for_scene(scene_dataset_config_file,
                                        scene,
                                        navmesh,
                                        output_dir,
                                        views_count,
                                        size,
                                        exist_ok=False,
                                        generate_depth=False,
                                        **kwargs):
    """
    Generate tuples of overlapping views for a given scene.
    generate_depth: generate depth images and camera parameters.
    """
    if os.path.exists(output_dir) and not exist_ok:
        print(f"Scene {scene}: data already generated. Ignoring generation.")
        return
    try:
        print(f"Scene {scene}: {size} multiview acquisitions to generate...")
        os.makedirs(output_dir, exist_ok=exist_ok)

        metadata_filename = os.path.join(output_dir, "metadata.json")

        metadata_template = dict(scene_dataset_config_file=scene_dataset_config_file,
                                 scene=scene,
                                 navmesh=navmesh,
                                 views_count=views_count,
                                 size=size,
                                 generate_depth=generate_depth,
                                 **kwargs)
        metadata_template["multiviews"] = dict()

        if os.path.exists(metadata_filename):
            print("Metadata file already exists:", metadata_filename)
            print("Loading already generated metadata file...")
            with open(metadata_filename, "r") as f:
                metadata = json.load(f)

            for key in metadata_template.keys():
                if key != "multiviews":
                    assert metadata_template[key] == metadata[key], f"existing file is inconsistent with the input parameters:\nKey: {key}\nmetadata: {metadata[key]}\ntemplate: {metadata_template[key]}."
        else:
            print("No temporary file found. Starting generation from scratch...")
            metadata = metadata_template

        starting_id = len(metadata["multiviews"])
        print(f"Starting generation from index {starting_id}/{size}...")
        if starting_id >= size:
            print("Generation already done.")
            return

        generator = MultiviewHabitatSimGenerator(scene_dataset_config_file=scene_dataset_config_file,
                                                 scene=scene,
                                                 navmesh=navmesh,
                                                 views_count=views_count,
                                                 size=size,
                                                 **kwargs)

        for idx in tqdm(range(starting_id, size)):
            # Generate / re-generate the observations
            try:
                data = generator[idx]
                observations = data["observations"]
                positions = data["positions"]
                orientations = data["orientations"]

                idx_label = f"{idx:08}"
                for oidx, observation in enumerate(observations):
                    observation_label = f"{oidx + 1}"  # Observations are indexed starting from 1
                    # Color image saved using PIL
                    img = PIL.Image.fromarray(observation['color'][:,:,:3])
                    filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg")
                    img.save(filename)
                    if generate_depth:
                        # Depth image as EXR file
                        filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr")
                        cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF])
                        # Camera parameters
                        camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")])
                        filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json")
                        with open(filename, "w") as f:
                            json.dump(camera_params, f)
                metadata["multiviews"][idx_label] = {"positions": positions.tolist(),
                                                     "orientations": orientations.tolist(),
                                                     "covisibility_ratios": data["covisibility_ratios"].tolist(),
                                                     "valid_fractions": data["valid_fractions"].tolist(),
                                                     "pairwise_visibility_ratios": data["pairwise_visibility_ratios"].tolist()}
            except RecursionError:
                print("Recursion error: unable to sample observations for this scene. We will stop there.")
                break

            # Regularly save a temporary metadata file, in case we need to restart the generation
            if idx % 10 == 0:
                with open(metadata_filename, "w") as f:
                    json.dump(metadata, f)

        # Save metadata
        with open(metadata_filename, "w") as f:
            json.dump(metadata, f)

        generator.close()
    except NoNaviguableSpaceError:
        pass

def create_commandline(scene_data, generate_depth, exist_ok=False):
    """
    Create a commandline string to generate a scene.
    """
    def my_formatting(val):
        if val is None or val == "":
            return '""'
        else:
            return val
    commandline = f"""python {__file__} --scene {my_formatting(scene_data.scene)}
    --scene_dataset_config_file {my_formatting(scene_data.scene_dataset_config_file)}
    --navmesh {my_formatting(scene_data.navmesh)}
    --output_dir {my_formatting(scene_data.output_dir)}
    --generate_depth {int(generate_depth)}
    --exist_ok {int(exist_ok)}
    """
    commandline = " ".join(commandline.split())
    return commandline

if __name__ == "__main__":
    os.umask(2)

    parser = argparse.ArgumentParser(description="""Example of use -- listing commands to generate data for all scenes available:
    > python datasets/habitat_sim/generate_multiview_images.py --list_commands
    """)

    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--list_commands", action='store_true', help="list commandlines to run if true")
    parser.add_argument("--scene", type=str, default="")
    parser.add_argument("--scene_dataset_config_file", type=str, default="")
    parser.add_argument("--navmesh", type=str, default="")

    parser.add_argument("--generate_depth", type=int, default=1)
    parser.add_argument("--exist_ok", type=int, default=0)

    kwargs = dict(resolution=(256,256), hfov=60, views_count=2, size=1000)

    args = parser.parse_args()
    generate_depth = bool(args.generate_depth)
    exist_ok = bool(args.exist_ok)

    if args.list_commands:
        # Listing scenes available...
        scenes_data = list_scenes_available(base_output_dir=args.output_dir)

        for scene_data in scenes_data:
            print(create_commandline(scene_data, generate_depth=generate_depth, exist_ok=exist_ok))
    else:
        if args.scene == "" or args.output_dir == "":
            print("Missing scene or output dir argument!")
            print(parser.format_help())
        else:
            generate_multiview_images_for_scene(scene=args.scene,
                                                scene_dataset_config_file=args.scene_dataset_config_file,
                                                navmesh=args.navmesh,
                                                output_dir=args.output_dir,
                                                exist_ok=exist_ok,
                                                generate_depth=generate_depth,
                                                **kwargs)
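Besides the printed commandlines, the generation entry point can be called directly; a minimal sketch, assuming a scene mesh and navmesh exist at the placeholder paths below:

```python
from datasets.habitat_sim.generate_multiview_images import generate_multiview_images_for_scene

generate_multiview_images_for_scene(
    scene_dataset_config_file="",
    scene="./data/habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.glb",       # placeholder
    navmesh="./data/habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.navmesh",  # placeholder
    output_dir="./out/skokloster-castle",
    views_count=2,         # pairs of views
    size=10,               # number of multiview tuples to sample
    generate_depth=True,   # also write EXR depth and camera parameters
    resolution=(256, 256),
    hfov=60,
)
```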
dust3r/croco/datasets_croco/habitat_sim/multiview_habitat_sim_generator.py
ADDED
@@ -0,0 +1,390 @@
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).

import os
import numpy as np
import quaternion
import habitat_sim
import json
from sklearn.neighbors import NearestNeighbors
import cv2

# OpenCV to Habitat camera convention transformation
R_OPENCV2HABITAT = np.stack((habitat_sim.geo.RIGHT, -habitat_sim.geo.UP, habitat_sim.geo.FRONT), axis=0)
R_HABITAT2OPENCV = R_OPENCV2HABITAT.T
DEG2RAD = np.pi / 180

def compute_camera_intrinsics(height, width, hfov):
    f = width/2 / np.tan(hfov/2 * np.pi/180)
    cu, cv = width/2, height/2
    return f, cu, cv

def compute_camera_pose_opencv_convention(camera_position, camera_orientation):
    R_cam2world = quaternion.as_rotation_matrix(camera_orientation) @ R_OPENCV2HABITAT
    t_cam2world = np.asarray(camera_position)
    return R_cam2world, t_cam2world

def compute_pointmap(depthmap, hfov):
    """ Compute a HxWx3 pointmap in camera frame from a HxW depth map."""
    height, width = depthmap.shape
    f, cu, cv = compute_camera_intrinsics(height, width, hfov)
    # Cast depth map to points
    z_cam = depthmap
    u, v = np.meshgrid(range(width), range(height))
    x_cam = (u - cu) / f * z_cam
    y_cam = (v - cv) / f * z_cam
    X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1)
    return X_cam

def compute_pointcloud(depthmap, hfov, camera_position, camera_rotation):
    """Return a 3D point cloud corresponding to valid pixels of the depth map"""
    R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_position, camera_rotation)

    X_cam = compute_pointmap(depthmap=depthmap, hfov=hfov)
    valid_mask = (X_cam[:,:,2] != 0.0)

    X_cam = X_cam.reshape(-1, 3)[valid_mask.flatten()]
    X_world = X_cam @ R_cam2world.T + t_cam2world.reshape(1, 3)
    return X_world

def compute_pointcloud_overlaps_scikit(pointcloud1, pointcloud2, distance_threshold, compute_symmetric=False):
    """
    Compute 'overlapping' metrics based on a distance threshold between two point clouds.
    """
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(pointcloud2)
    distances, indices = nbrs.kneighbors(pointcloud1)
    intersection1 = np.count_nonzero(distances.flatten() < distance_threshold)

    data = {"intersection1": intersection1,
            "size1": len(pointcloud1)}
    if compute_symmetric:
        nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(pointcloud1)
        distances, indices = nbrs.kneighbors(pointcloud2)
        intersection2 = np.count_nonzero(distances.flatten() < distance_threshold)
        data["intersection2"] = intersection2
        data["size2"] = len(pointcloud2)

    return data

def _append_camera_parameters(observation, hfov, camera_location, camera_rotation):
    """
    Add camera parameters to the observation dictionary produced by Habitat-Sim.
    In-place modifications.
    """
    R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_location, camera_rotation)
    height, width = observation['depth'].shape
    f, cu, cv = compute_camera_intrinsics(height, width, hfov)
    K = np.asarray([[f, 0, cu],
                    [0, f, cv],
                    [0, 0, 1.0]])
    observation["camera_intrinsics"] = K
    observation["t_cam2world"] = t_cam2world
    observation["R_cam2world"] = R_cam2world

def look_at(eye, center, up, return_cam2world=True):
    """
    Return camera pose looking at a given center point.
    Analogous to the gluLookAt function, using the OpenCV camera convention.
    """
    z = center - eye
    z /= np.linalg.norm(z, axis=-1, keepdims=True)
    y = -up
    y = y - np.sum(y * z, axis=-1, keepdims=True) * z
    y /= np.linalg.norm(y, axis=-1, keepdims=True)
    x = np.cross(y, z, axis=-1)

    if return_cam2world:
        R = np.stack((x, y, z), axis=-1)
        t = eye
    else:
        # World to camera transformation
        # Transposed matrix
        R = np.stack((x, y, z), axis=-2)
        t = - np.einsum('...ij, ...j', R, eye)
    return R, t

def look_at_for_habitat(eye, center, up, return_cam2world=True):
    R, t = look_at(eye, center, up)
    orientation = quaternion.from_rotation_matrix(R @ R_OPENCV2HABITAT.T)
    return orientation, t

def generate_orientation_noise(pan_range, tilt_range, roll_range):
    return (quaternion.from_rotation_vector(np.random.uniform(*pan_range) * DEG2RAD * habitat_sim.geo.UP)
            * quaternion.from_rotation_vector(np.random.uniform(*tilt_range) * DEG2RAD * habitat_sim.geo.RIGHT)
            * quaternion.from_rotation_vector(np.random.uniform(*roll_range) * DEG2RAD * habitat_sim.geo.FRONT))


class NoNaviguableSpaceError(RuntimeError):
    def __init__(self, *args):
        super().__init__(*args)

class MultiviewHabitatSimGenerator:
    def __init__(self,
                 scene,
                 navmesh,
                 scene_dataset_config_file,
                 resolution=(240, 320),
                 views_count=2,
                 hfov=60,
                 gpu_id=0,
                 size=10000,
                 minimum_covisibility=0.5,
                 transform=None):
        self.scene = scene
        self.navmesh = navmesh
        self.scene_dataset_config_file = scene_dataset_config_file
        self.resolution = resolution
        self.views_count = views_count
        assert self.views_count >= 1
        self.hfov = hfov
        self.gpu_id = gpu_id
        self.size = size
        self.transform = transform

        # Noise added to camera orientation
        self.pan_range = (-3, 3)
        self.tilt_range = (-10, 10)
        self.roll_range = (-5, 5)

        # Height range to sample cameras
        self.height_range = (1.2, 1.8)

        # Random steps between the camera views
        self.random_steps_count = 5
        self.random_step_variance = 2.0

        # Minimum fraction of the scene which should be valid (well-defined depth)
        self.minimum_valid_fraction = 0.7

        # Distance threshold used to select pairs
        self.distance_threshold = 0.05
        # Minimum covisibility of a view's point cloud with respect to the reference view for the view to be kept
        self.minimum_covisibility = minimum_covisibility

        # Maximum number of retries.
        self.max_attempts_count = 100

        self.seed = None
        self._lazy_initialization()

    def _lazy_initialization(self):
        # Lazy random seeding and instantiation of the simulator to deal with multiprocessing properly
        if self.seed is None:
            # Re-seed numpy generator
            np.random.seed()
            self.seed = np.random.randint(2**32-1)
        sim_cfg = habitat_sim.SimulatorConfiguration()
        sim_cfg.scene_id = self.scene
        if self.scene_dataset_config_file is not None and self.scene_dataset_config_file != "":
            sim_cfg.scene_dataset_config_file = self.scene_dataset_config_file
        sim_cfg.random_seed = self.seed
        sim_cfg.load_semantic_mesh = False
        sim_cfg.gpu_device_id = self.gpu_id

        depth_sensor_spec = habitat_sim.CameraSensorSpec()
        depth_sensor_spec.uuid = "depth"
        depth_sensor_spec.sensor_type = habitat_sim.SensorType.DEPTH
        depth_sensor_spec.resolution = self.resolution
        depth_sensor_spec.hfov = self.hfov
        depth_sensor_spec.position = [0.0, 0.0, 0]

        rgb_sensor_spec = habitat_sim.CameraSensorSpec()
        rgb_sensor_spec.uuid = "color"
        rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR
        rgb_sensor_spec.resolution = self.resolution
        rgb_sensor_spec.hfov = self.hfov
        rgb_sensor_spec.position = [0.0, 0.0, 0]
        agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec, depth_sensor_spec])

        cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])
        self.sim = habitat_sim.Simulator(cfg)
        if self.navmesh is not None and self.navmesh != "":
            # Use a pre-computed navmesh when available (usually better than those generated automatically)
            self.sim.pathfinder.load_nav_mesh(self.navmesh)

        if not self.sim.pathfinder.is_loaded:
            # Try to compute a navmesh
            navmesh_settings = habitat_sim.NavMeshSettings()
            navmesh_settings.set_defaults()
            self.sim.recompute_navmesh(self.sim.pathfinder, navmesh_settings, True)

        # Ensure that the navmesh is not empty
        if not self.sim.pathfinder.is_loaded:
            raise NoNaviguableSpaceError(f"No navigable location (scene: {self.scene} -- navmesh: {self.navmesh})")

        self.agent = self.sim.initialize_agent(agent_id=0)

    def close(self):
        self.sim.close()

    def __del__(self):
        self.sim.close()

    def __len__(self):
        return self.size

    def sample_random_viewpoint(self):
        """ Sample a random viewpoint using the navmesh """
        nav_point = self.sim.pathfinder.get_random_navigable_point()

        # Sample a random viewpoint height
        viewpoint_height = np.random.uniform(*self.height_range)
        viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP
        viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(0, 2 * np.pi) * habitat_sim.geo.UP) * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range)
        return viewpoint_position, viewpoint_orientation, nav_point

    def sample_other_random_viewpoint(self, observed_point, nav_point):
        """ Sample a random viewpoint close to an existing one, using the navmesh and a reference observed point."""
        other_nav_point = nav_point

        walk_directions = self.random_step_variance * np.asarray([1,0,1])
        for i in range(self.random_steps_count):
            temp = self.sim.pathfinder.snap_point(other_nav_point + walk_directions * np.random.normal(size=3))
            # Snapping may return nan when it fails
            if not np.isnan(temp[0]):
                other_nav_point = temp

        other_viewpoint_height = np.random.uniform(*self.height_range)
        other_viewpoint_position = other_nav_point + other_viewpoint_height * habitat_sim.geo.UP

        # Set viewing direction towards the central point
        rotation, position = look_at_for_habitat(eye=other_viewpoint_position, center=observed_point, up=habitat_sim.geo.UP, return_cam2world=True)
        rotation = rotation * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range)
        return position, rotation, other_nav_point

    def is_other_pointcloud_overlapping(self, ref_pointcloud, other_pointcloud):
        """ Check if a viewpoint is valid and overlaps significantly with a reference one. """
        # Observation
        pixels_count = self.resolution[0] * self.resolution[1]
        valid_fraction = len(other_pointcloud) / pixels_count
        assert 0.0 <= valid_fraction <= 1.0
        overlap = compute_pointcloud_overlaps_scikit(ref_pointcloud, other_pointcloud, self.distance_threshold, compute_symmetric=True)
        covisibility = min(overlap["intersection1"] / pixels_count, overlap["intersection2"] / pixels_count)
        is_valid = (valid_fraction >= self.minimum_valid_fraction) and (covisibility >= self.minimum_covisibility)
        return is_valid, valid_fraction, covisibility

    def is_other_viewpoint_overlapping(self, ref_pointcloud, observation, position, rotation):
        """ Check if a viewpoint is valid and overlaps significantly with a reference one. """
        # Observation
        other_pointcloud = compute_pointcloud(observation['depth'], self.hfov, position, rotation)
        return self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud)

    def render_viewpoint(self, viewpoint_position, viewpoint_orientation):
        agent_state = habitat_sim.AgentState()
        agent_state.position = viewpoint_position
        agent_state.rotation = viewpoint_orientation
        self.agent.set_state(agent_state)
        viewpoint_observations = self.sim.get_sensor_observations(agent_ids=0)
        _append_camera_parameters(viewpoint_observations, self.hfov, viewpoint_position, viewpoint_orientation)
        return viewpoint_observations

    def __getitem__(self, useless_idx):
        ref_position, ref_orientation, nav_point = self.sample_random_viewpoint()
        ref_observations = self.render_viewpoint(ref_position, ref_orientation)
        # Extract point cloud
        ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov,
                                            camera_position=ref_position, camera_rotation=ref_orientation)

        pixels_count = self.resolution[0] * self.resolution[1]
        ref_valid_fraction = len(ref_pointcloud) / pixels_count
        assert 0.0 <= ref_valid_fraction <= 1.0
        if ref_valid_fraction < self.minimum_valid_fraction:
            # This should produce a recursion error at some point when something is very wrong.
            return self[0]
        # Pick a reference observed point in the point cloud
        observed_point = np.mean(ref_pointcloud, axis=0)

        # Add the first image as reference
        viewpoints_observations = [ref_observations]
        viewpoints_covisibility = [ref_valid_fraction]
        viewpoints_positions = [ref_position]
        viewpoints_orientations = [quaternion.as_float_array(ref_orientation)]
        viewpoints_clouds = [ref_pointcloud]
        viewpoints_valid_fractions = [ref_valid_fraction]

        for _ in range(self.views_count - 1):
            # Generate another viewpoint using a simple random walk
            successful_sampling = False
            for sampling_attempt in range(self.max_attempts_count):
                position, rotation, _ = self.sample_other_random_viewpoint(observed_point, nav_point)
                # Observation
                other_viewpoint_observations = self.render_viewpoint(position, rotation)
                other_pointcloud = compute_pointcloud(other_viewpoint_observations['depth'], self.hfov, position, rotation)

                is_valid, valid_fraction, covisibility = self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud)
                if is_valid:
                    successful_sampling = True
                    break
            if not successful_sampling:
                print("WARNING: Maximum number of attempts reached.")
                # Dirty hack: try using a novel original viewpoint
                return self[0]
            viewpoints_observations.append(other_viewpoint_observations)
            viewpoints_covisibility.append(covisibility)
            viewpoints_positions.append(position)
            viewpoints_orientations.append(quaternion.as_float_array(rotation))  # WXYZ convention for the quaternion encoding.
            viewpoints_clouds.append(other_pointcloud)
            viewpoints_valid_fractions.append(valid_fraction)

        # Estimate relations between all pairs of images
        pairwise_visibility_ratios = np.ones((len(viewpoints_observations), len(viewpoints_observations)))
        for i in range(len(viewpoints_observations)):
            pairwise_visibility_ratios[i,i] = viewpoints_valid_fractions[i]
            for j in range(i+1, len(viewpoints_observations)):
                overlap = compute_pointcloud_overlaps_scikit(viewpoints_clouds[i], viewpoints_clouds[j], self.distance_threshold, compute_symmetric=True)
                pairwise_visibility_ratios[i,j] = overlap['intersection1'] / pixels_count
                pairwise_visibility_ratios[j,i] = overlap['intersection2'] / pixels_count

        # Covisibility is measured relative to image 0
        data = {"observations": viewpoints_observations,
                "positions": np.asarray(viewpoints_positions),
                "orientations": np.asarray(viewpoints_orientations),
                "covisibility_ratios": np.asarray(viewpoints_covisibility),
                "valid_fractions": np.asarray(viewpoints_valid_fractions, dtype=float),
                "pairwise_visibility_ratios": np.asarray(pairwise_visibility_ratios, dtype=float),
                }

        if self.transform is not None:
            data = self.transform(data)
        return data

    def generate_random_spiral_trajectory(self, images_count=100, max_radius=0.5, half_turns=5, use_constant_orientation=False):
        """
        Return a list of images corresponding to a spiral trajectory from a random starting point.
        Useful to generate nice visualisations.
        Use an even number of half turns to get a nice "C1-continuous" loop effect.
        """
        ref_position, ref_orientation, navpoint = self.sample_random_viewpoint()
        ref_observations = self.render_viewpoint(ref_position, ref_orientation)
        ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov,
                                            camera_position=ref_position, camera_rotation=ref_orientation)
        pixels_count = self.resolution[0] * self.resolution[1]
        if len(ref_pointcloud) / pixels_count < self.minimum_valid_fraction:
            # Dirty hack: ensure that the valid part of the image is significant
            return self.generate_random_spiral_trajectory(images_count, max_radius, half_turns, use_constant_orientation)

        # Pick an observed point in the point cloud
        observed_point = np.mean(ref_pointcloud, axis=0)
        ref_R, ref_t = compute_camera_pose_opencv_convention(ref_position, ref_orientation)

        images = []
        is_valid = []
        # Spiral trajectory, optionally with a constant orientation
        for i, alpha in enumerate(np.linspace(0, 1, images_count)):
            r = max_radius * np.abs(np.sin(alpha * np.pi))  # Increase then decrease the radius
            theta = alpha * half_turns * np.pi
            x = r * np.cos(theta)
            y = r * np.sin(theta)
            z = 0.0
            position = ref_position + (ref_R @ np.asarray([x, y, z]).reshape(3,1)).flatten()
            if use_constant_orientation:
                orientation = ref_orientation
            else:
                # Trajectory looking at a mean point in front of the reference observation
                orientation, position = look_at_for_habitat(eye=position, center=observed_point, up=habitat_sim.geo.UP)
            observations = self.render_viewpoint(position, orientation)
            images.append(observations['color'][...,:3])
            _is_valid, valid_fraction, iou = self.is_other_viewpoint_overlapping(ref_pointcloud, observations, position, orientation)
            is_valid.append(_is_valid)
        return images, np.all(is_valid)
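`compute_pointmap` is the standard pinhole back-projection: with focal length f = (W/2) / tan(hfov/2) and principal point (cu, cv) at the image center, a pixel (u, v) with depth z maps to ((u-cu) z / f, (v-cv) z / f, z). A small self-contained round-trip check of this formula (no Habitat dependency; the helper mirrors `compute_pointmap` for a single pixel):

```python
import numpy as np

def backproject_pixel(u, v, z, height, width, hfov_deg):
    """Mirror of compute_pointmap's formula for a single pixel."""
    f = (width / 2) / np.tan(np.deg2rad(hfov_deg) / 2)
    cu, cv = width / 2, height / 2
    return np.array([(u - cu) / f * z, (v - cv) / f * z, z])

height, width, hfov = 256, 256, 60
X = backproject_pixel(u=200, v=100, z=2.5, height=height, width=width, hfov_deg=hfov)

# Project the 3D point back to the pixel grid: recovers (u, v) exactly.
f = (width / 2) / np.tan(np.deg2rad(hfov) / 2)
u, v = X[0] / X[2] * f + width / 2, X[1] / X[2] * f + height / 2
assert np.allclose([u, v], [200, 100])
```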
dust3r/croco/datasets_croco/habitat_sim/pack_metadata_files.py
ADDED
@@ -0,0 +1,69 @@
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
"""
Utility script to pack the metadata files of the dataset, in order to be able to re-generate it elsewhere.
"""
import os
import glob
from tqdm import tqdm
import shutil
import json
from datasets.habitat_sim.paths import *
import argparse
import collections

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input_dir")
    parser.add_argument("output_dir")
    args = parser.parse_args()

    input_dirname = args.input_dir
    output_dirname = args.output_dir

    input_metadata_filenames = glob.iglob(f"{input_dirname}/**/metadata.json", recursive=True)

    images_count = collections.defaultdict(int)

    os.makedirs(output_dirname)
    for input_filename in tqdm(input_metadata_filenames):
        # Ignore empty files
        with open(input_filename, "r") as f:
            original_metadata = json.load(f)
            if "multiviews" not in original_metadata or len(original_metadata["multiviews"]) == 0:
                print("No views in", input_filename)
                continue

        relpath = os.path.relpath(input_filename, input_dirname)
        print(relpath)

        # Copy metadata, while replacing scene paths by generic keys depending on the dataset, for portability.
        # Dataset paths are sorted by decreasing length to avoid bugs due to paths starting with the same string pattern.
        scenes_dataset_paths = dict(sorted(SCENES_DATASET.items(), key=lambda x: len(x[1]), reverse=True))
        metadata = dict()
        for key, value in original_metadata.items():
            if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "":
                known_path = False
                for dataset, dataset_path in scenes_dataset_paths.items():
                    if value.startswith(dataset_path):
                        value = os.path.join(dataset, os.path.relpath(value, dataset_path))
                        known_path = True
                        break
                if not known_path:
                    raise KeyError("Unknown path: " + value)
            metadata[key] = value

        # Compile some general statistics while packing data
        scene_split = metadata["scene"].split("/")
        upper_level = "/".join(scene_split[:2]) if scene_split[0] == "hm3d" else scene_split[0]
        images_count[upper_level] += len(metadata["multiviews"])

        output_filename = os.path.join(output_dirname, relpath)
        os.makedirs(os.path.dirname(output_filename), exist_ok=True)
        with open(output_filename, "w") as f:
            json.dump(metadata, f)

    # Print statistics
    print("Images count:")
    for upper_level, count in images_count.items():
        print(f"- {upper_level}: {count}")
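Both this script and `generate_from_metadata.py` sort dataset paths by decreasing length before prefix matching. Without that, a label such as `replica_cad_baked_lighting` could be captured by the shorter `replica_cad` key. A tiny illustration of the pitfall (the dictionary below is made up for the example):

```python
labels = {"replica_cad": "/data/replica_cad/",
          "replica_cad_baked_lighting": "/data/replica_cad_baked_lighting/"}
value = "replica_cad_baked_lighting/Baked_sc0_staging_00"

# Insertion order matches the shorter, wrong key first:
naive = next(k for k in labels if value.startswith(k))                                # 'replica_cad'
# Sorting keys by decreasing length guarantees the most specific match:
safe = next(k for k in sorted(labels, key=len, reverse=True) if value.startswith(k))  # 'replica_cad_baked_lighting'
assert safe == "replica_cad_baked_lighting"
```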
dust3r/croco/datasets_croco/habitat_sim/paths.py
ADDED
@@ -0,0 +1,129 @@
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).

"""
Paths to Habitat-Sim scenes
"""

import os
import json
import collections
from tqdm import tqdm


# Hardcoded paths to the different scene datasets
SCENES_DATASET = {
    "hm3d": "./data/habitat-sim-data/scene_datasets/hm3d/",
    "gibson": "./data/habitat-sim-data/scene_datasets/gibson/",
    "habitat-test-scenes": "./data/habitat-sim/scene_datasets/habitat-test-scenes/",
    "replica_cad_baked_lighting": "./data/habitat-sim/scene_datasets/replica_cad_baked_lighting/",
    "replica_cad": "./data/habitat-sim/scene_datasets/replica_cad/",
    "replica": "./data/habitat-sim/scene_datasets/ReplicaDataset/",
    "scannet": "./data/habitat-sim/scene_datasets/scannet/"
}

SceneData = collections.namedtuple("SceneData", ["scene_dataset_config_file", "scene", "navmesh", "output_dir"])

def list_replicacad_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad"]):
    scene_dataset_config_file = os.path.join(base_path, "replicaCAD.scene_dataset_config.json")
    scenes = [f"apt_{i}" for i in range(6)] + ["empty_stage"]
    navmeshes = [f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + ["empty_stage.navmesh"]
    scenes_data = []
    for idx in range(len(scenes)):
        output_dir = os.path.join(base_output_dir, "ReplicaCAD", scenes[idx])
        # Add scene
        data = SceneData(scene_dataset_config_file=scene_dataset_config_file,
                         scene=scenes[idx] + ".scene_instance.json",
                         navmesh=os.path.join(base_path, navmeshes[idx]),
                         output_dir=output_dir)
        scenes_data.append(data)
    return scenes_data

def list_replica_cad_baked_lighting_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad_baked_lighting"]):
    scene_dataset_config_file = os.path.join(base_path, "replicaCAD_baked.scene_dataset_config.json")
    scenes = sum([[f"Baked_sc{i}_staging_{j:02}" for i in range(5)] for j in range(21)], [])
    scenes_data = []
    for idx in range(len(scenes)):
        output_dir = os.path.join(base_output_dir, "replica_cad_baked_lighting", scenes[idx])
        # No navmesh is shipped for these scenes; one is recomputed on the fly by the generator
        data = SceneData(scene_dataset_config_file=scene_dataset_config_file,
                         scene=scenes[idx],
                         navmesh="",
                         output_dir=output_dir)
        scenes_data.append(data)
    return scenes_data

def list_replica_scenes(base_output_dir, base_path):
    scenes_data = []
    for scene_id in os.listdir(base_path):
        scene = os.path.join(base_path, scene_id, "mesh.ply")
        navmesh = os.path.join(base_path, scene_id, "habitat/mesh_preseg_semantic.navmesh")  # Not sure if I should use it
        scene_dataset_config_file = ""
        output_dir = os.path.join(base_output_dir, scene_id)
        # Add scene
        data = SceneData(scene_dataset_config_file=scene_dataset_config_file,
                         scene=scene,
                         navmesh=navmesh,
                         output_dir=output_dir)
        scenes_data.append(data)
    return scenes_data


def list_scenes(base_output_dir, base_path):
    """
    Generic method iterating through a base_path folder to find scenes.
    """
    scenes_data = []
    for root, dirs, files in os.walk(base_path, followlinks=True):
        folder_scenes_data = []
        for file in files:
            name, ext = os.path.splitext(file)
            if ext == ".glb":
                scene = os.path.join(root, name + ".glb")
                navmesh = os.path.join(root, name + ".navmesh")
                if not os.path.exists(navmesh):
                    navmesh = ""
                relpath = os.path.relpath(root, base_path)
                output_dir = os.path.abspath(os.path.join(base_output_dir, relpath, name))
                data = SceneData(scene_dataset_config_file="",
                                 scene=scene,
                                 navmesh=navmesh,
                                 output_dir=output_dir)
                folder_scenes_data.append(data)

        # Specific check for HM3D:
        # when two meshes xxxx.basis.glb and xxxx.glb are present, use the 'basis' version.
        basis_scenes = [data.scene[:-len(".basis.glb")] for data in folder_scenes_data if data.scene.endswith(".basis.glb")]
        if len(basis_scenes) != 0:
            folder_scenes_data = [data for data in folder_scenes_data if not (data.scene[:-len(".glb")] in basis_scenes)]

        scenes_data.extend(folder_scenes_data)
    return scenes_data

def list_scenes_available(base_output_dir, scenes_dataset_paths=SCENES_DATASET):
    scenes_data = []

    # HM3D
    for split in ("minival", "train", "val", "examples"):
        scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, f"hm3d/{split}/"),
                                   base_path=f"{scenes_dataset_paths['hm3d']}/{split}")

    # Gibson
    scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "gibson"),
                               base_path=scenes_dataset_paths["gibson"])

    # Habitat test scenes (just a few)
    scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "habitat-test-scenes"),
                               base_path=scenes_dataset_paths["habitat-test-scenes"])

    # ReplicaCAD (baked lighting)
    scenes_data += list_replica_cad_baked_lighting_scenes(base_output_dir=base_output_dir)

    # ScanNet
    scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "scannet"),
                               base_path=scenes_dataset_paths["scannet"])

    # Replica
    scenes_data += list_replica_scenes(base_output_dir=os.path.join(base_output_dir, "replica"),
                                       base_path=scenes_dataset_paths["replica"])
    return scenes_data
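A quick sketch of enumerating the scenes this module can find, assuming the datasets are installed under the hardcoded paths above:

```python
from datasets.habitat_sim.paths import list_scenes_available

scenes = list_scenes_available(base_output_dir="./data/habitat_release/")
print(f"{len(scenes)} scenes found")
for s in scenes[:3]:
    # SceneData is a namedtuple: scene mesh, optional navmesh, and target output directory.
    print(s.scene, "->", s.output_dir, "(navmesh:", s.navmesh or "recomputed", ")")
```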
dust3r/croco/datasets_croco/pairs_dataset.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```python
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).

import os
from torch.utils.data import Dataset
from PIL import Image

# note: the folder is named datasets_croco in this tree (renamed from croco's original 'datasets')
from datasets_croco.transforms import get_pair_transforms


def load_image(impath):
    return Image.open(impath)


def load_pairs_from_cache_file(fname, root=''):
    assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname)
    with open(fname, 'r') as fid:
        lines = fid.read().strip().splitlines()
    pairs = [(os.path.join(root, l.split()[0]), os.path.join(root, l.split()[1])) for l in lines]
    return pairs


def load_pairs_from_list_file(fname, root=''):
    assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname)
    with open(fname, 'r') as fid:
        lines = fid.read().strip().splitlines()
    pairs = [(os.path.join(root, l + '_1.jpg'), os.path.join(root, l + '_2.jpg')) for l in lines if not l.startswith('#')]
    return pairs


def write_cache_file(fname, pairs, root=''):
    if len(root) > 0:
        if not root.endswith('/'): root += '/'
        assert os.path.isdir(root)
    s = ''
    for im1, im2 in pairs:
        if len(root) > 0:
            assert im1.startswith(root), im1
            assert im2.startswith(root), im2
        s += '{:s} {:s}\n'.format(im1[len(root):], im2[len(root):])
    with open(fname, 'w') as fid:
        fid.write(s[:-1])


def parse_and_cache_all_pairs(dname, data_dir='./data/'):
    if dname == 'habitat_release':
        dirname = os.path.join(data_dir, 'habitat_release')
        assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: " + dirname
        cache_file = os.path.join(dirname, 'pairs.txt')
        assert not os.path.isfile(cache_file), "cache file already exists: " + cache_file

        print('Parsing pairs for dataset: ' + dname)
        pairs = []
        for root, dirs, files in os.walk(dirname):
            if 'val' in root: continue
            dirs.sort()
            pairs += [(os.path.join(root, f), os.path.join(root, f[:-len('_1.jpeg')] + '_2.jpeg')) for f in sorted(files) if f.endswith('_1.jpeg')]
        print('Found {:,} pairs'.format(len(pairs)))
        print('Writing cache to: ' + cache_file)
        write_cache_file(cache_file, pairs, root=dirname)

    else:
        raise NotImplementedError('Unknown dataset: ' + dname)


def dnames_to_image_pairs(dnames, data_dir='./data/'):
    """
    dnames: '+'-separated string of dataset names with image pairs
    """
    all_pairs = []
    for dname in dnames.split('+'):
        if dname == 'habitat_release':
            dirname = os.path.join(data_dir, 'habitat_release')
            assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: " + dirname
            cache_file = os.path.join(dirname, 'pairs.txt')
            assert os.path.isfile(cache_file), "cannot find cache file for habitat_release pairs, please first create the cache file, see instructions. " + cache_file
            pairs = load_pairs_from_cache_file(cache_file, root=dirname)
        elif dname in ['ARKitScenes', 'MegaDepth', '3DStreetView', 'IndoorVL']:
            dirname = os.path.join(data_dir, dname + '_crops')
            assert os.path.isdir(dirname), "cannot find folder for {:s} pairs: {:s}".format(dname, dirname)
            list_file = os.path.join(dirname, 'listing.txt')
            assert os.path.isfile(list_file), "cannot find list file for {:s} pairs, see instructions. {:s}".format(dname, list_file)
            pairs = load_pairs_from_list_file(list_file, root=dirname)
        else:
            raise NotImplementedError('Unknown dataset: ' + dname)
        print('  {:s}: {:,} pairs'.format(dname, len(pairs)))
        all_pairs += pairs
    if '+' in dnames: print(' Total: {:,} pairs'.format(len(all_pairs)))
    return all_pairs


class PairsDataset(Dataset):

    def __init__(self, dnames, trfs='', totensor=True, normalize=True, data_dir='./data/'):
        super().__init__()
        self.image_pairs = dnames_to_image_pairs(dnames, data_dir=data_dir)
        self.transforms = get_pair_transforms(transform_str=trfs, totensor=totensor, normalize=normalize)

    def __len__(self):
        return len(self.image_pairs)

    def __getitem__(self, index):
        im1path, im2path = self.image_pairs[index]
        im1 = load_image(im1path)
        im2 = load_image(im2path)
        if self.transforms is not None: im1, im2 = self.transforms(im1, im2)
        return im1, im2


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Compute and cache the list of pairs for a given dataset")
    parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored")
    parser.add_argument('--dataset', default='habitat_release', type=str, help="name of the dataset")
    args = parser.parse_args()
    parse_and_cache_all_pairs(dname=args.dataset, data_dir=args.data_dir)
```
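For context, a minimal usage sketch of this dataset (assuming the habitat_release pairs cache described above has already been built; the batch size and loader settings are illustrative, not from the diff):

```python
# Minimal sketch, run from the dust3r/croco/ directory; assumes the cache
# exists, e.g. created with: python -m datasets_croco.pairs_dataset --dataset habitat_release
from torch.utils.data import DataLoader
from datasets_croco.pairs_dataset import PairsDataset

dataset = PairsDataset('habitat_release', trfs='crop224+acolor', data_dir='./data/')
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)
im1, im2 = next(iter(loader))  # two (8, 3, 224, 224) ImageNet-normalized tensors
print(im1.shape, im2.shape)
```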
dust3r/croco/datasets_croco/transforms.py
ADDED
@@ -0,0 +1,95 @@
```python
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).

import torch
import torchvision.transforms
import torchvision.transforms.functional as F

# "Pair": apply a transform to a pair, potentially with different parameters per image
# "Both": apply the exact same transform to both images

class ComposePair(torchvision.transforms.Compose):
    def __call__(self, img1, img2):
        for t in self.transforms:
            img1, img2 = t(img1, img2)
        return img1, img2

class NormalizeBoth(torchvision.transforms.Normalize):
    def forward(self, img1, img2):
        img1 = super().forward(img1)
        img2 = super().forward(img2)
        return img1, img2

class ToTensorBoth(torchvision.transforms.ToTensor):
    def __call__(self, img1, img2):
        img1 = super().__call__(img1)
        img2 = super().__call__(img2)
        return img1, img2

class RandomCropPair(torchvision.transforms.RandomCrop):
    # the crop is intentionally different for the two images with this class
    def forward(self, img1, img2):
        img1 = super().forward(img1)
        img2 = super().forward(img2)
        return img1, img2

class ColorJitterPair(torchvision.transforms.ColorJitter):
    # can be symmetric (same jitter for both images) or asymmetric (different jitter
    # parameters for each image) depending on assymetric_prob
    def __init__(self, assymetric_prob, **kwargs):
        super().__init__(**kwargs)
        self.assymetric_prob = assymetric_prob

    def jitter_one(self, img, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor):
        for fn_id in fn_idx:
            if fn_id == 0 and brightness_factor is not None:
                img = F.adjust_brightness(img, brightness_factor)
            elif fn_id == 1 and contrast_factor is not None:
                img = F.adjust_contrast(img, contrast_factor)
            elif fn_id == 2 and saturation_factor is not None:
                img = F.adjust_saturation(img, saturation_factor)
            elif fn_id == 3 and hue_factor is not None:
                img = F.adjust_hue(img, hue_factor)
        return img

    def forward(self, img1, img2):
        fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor = self.get_params(
            self.brightness, self.contrast, self.saturation, self.hue
        )
        img1 = self.jitter_one(img1, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor)
        if torch.rand(1) < self.assymetric_prob:  # asymmetric: re-draw the jitter parameters for the second image
            fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor = self.get_params(
                self.brightness, self.contrast, self.saturation, self.hue
            )
        img2 = self.jitter_one(img2, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor)
        return img1, img2

def get_pair_transforms(transform_str, totensor=True, normalize=True):
    # transform_str is e.g. 'crop224+acolor'
    trfs = []
    for s in transform_str.split('+'):
        if s.startswith('crop'):
            size = int(s[len('crop'):])
            trfs.append(RandomCropPair(size))
        elif s == 'acolor':
            trfs.append(ColorJitterPair(assymetric_prob=1.0, brightness=(0.6, 1.4), contrast=(0.6, 1.4), saturation=(0.6, 1.4), hue=0.0))
        elif s == '':  # if transform_str was ""
            pass
        else:
            raise NotImplementedError('Unknown augmentation: ' + s)

    if totensor:
        trfs.append(ToTensorBoth())
    if normalize:
        trfs.append(NormalizeBoth(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))

    if len(trfs) == 0:
        return None
    elif len(trfs) == 1:
        return trfs[0]  # return the single callable transform, not the list
    else:
        return ComposePair(trfs)
```
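A quick illustration of what get_pair_transforms builds for a 'crop224+acolor' string: independent 224-pixel crops and color jitter per image, followed by a shared ToTensor/Normalize. The blank PIL images below are placeholders:

```python
# Minimal sketch: build the pipeline and run one (placeholder) pair through it.
from PIL import Image
from datasets_croco.transforms import get_pair_transforms

trf = get_pair_transforms('crop224+acolor')  # RandomCropPair + ColorJitterPair + ToTensorBoth + NormalizeBoth
im1 = Image.new('RGB', (320, 320))           # any pair of images >= 224px would do
im2 = Image.new('RGB', (320, 320))
t1, t2 = trf(im1, im2)                       # crops and jitter differ between the two images
print(t1.shape, t2.shape)                    # torch.Size([3, 224, 224]) twice
```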
dust3r/croco/demo.py
ADDED
@@ -0,0 +1,55 @@
```python
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).

import torch
from models.croco import CroCoNet
from PIL import Image
import torchvision.transforms
from torchvision.transforms import ToTensor, Normalize, Compose

def main():
    device = torch.device('cuda:0' if torch.cuda.is_available() and torch.cuda.device_count() > 0 else 'cpu')

    # load 224x224 images and transform them to tensors
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_mean_tensor = torch.tensor(imagenet_mean).view(1, 3, 1, 1).to(device, non_blocking=True)
    imagenet_std = [0.229, 0.224, 0.225]
    imagenet_std_tensor = torch.tensor(imagenet_std).view(1, 3, 1, 1).to(device, non_blocking=True)
    trfs = Compose([ToTensor(), Normalize(mean=imagenet_mean, std=imagenet_std)])
    image1 = trfs(Image.open('assets/Chateau1.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0)
    image2 = trfs(Image.open('assets/Chateau2.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0)

    # load model
    ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')
    model = CroCoNet(**ckpt.get('croco_kwargs', {})).to(device)
    model.eval()
    msg = model.load_state_dict(ckpt['model'], strict=True)

    # forward
    with torch.inference_mode():
        out, mask, target = model(image1, image2)

    # the output is normalized per patch, so use the mean/std of the actual image to go back to RGB space
    patchified = model.patchify(image1)
    mean = patchified.mean(dim=-1, keepdim=True)
    var = patchified.var(dim=-1, keepdim=True)
    decoded_image = model.unpatchify(out * (var + 1.e-6)**.5 + mean)
    # undo ImageNet normalization, prepare masked image
    decoded_image = decoded_image * imagenet_std_tensor + imagenet_mean_tensor
    input_image = image1 * imagenet_std_tensor + imagenet_mean_tensor
    ref_image = image2 * imagenet_std_tensor + imagenet_mean_tensor
    image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:, :, None])
    masked_input_image = ((1 - image_masks) * input_image)

    # make visualization
    visualization = torch.cat((ref_image, masked_input_image, decoded_image, input_image), dim=3)  # 4*(B, 3, H, W) -> (B, 3, H, W*4)
    B, C, H, W = visualization.shape
    visualization = visualization.permute(1, 0, 2, 3).reshape(C, B * H, W)
    visualization = torchvision.transforms.functional.to_pil_image(torch.clamp(visualization, 0, 1))
    fname = "demo_output.png"
    visualization.save(fname)
    print('Visualization saved in ' + fname)


if __name__ == "__main__":
    main()
```
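The renormalization step above (out * (var + 1.e-6)**.5 + mean) inverts the per-patch normalization applied to the training targets. A self-contained sketch of that round trip is below; the patchify/unpatchify helpers are simplified stand-ins written for this illustration (16-pixel patches assumed), not CroCoNet's own methods:

```python
import torch

def patchify(imgs, p=16):
    """(B, C, H, W) -> (B, N, p*p*C) non-overlapping patches, N = (H//p)*(W//p)."""
    B, C, H, W = imgs.shape
    x = imgs.reshape(B, C, H // p, p, W // p, p)
    return x.permute(0, 2, 4, 3, 5, 1).reshape(B, (H // p) * (W // p), p * p * C)

def unpatchify(x, p=16, H=224, W=224):
    """Inverse of patchify."""
    B, N, D = x.shape
    C = D // (p * p)
    x = x.reshape(B, H // p, W // p, p, p, C)
    return x.permute(0, 5, 1, 3, 2, 4).reshape(B, C, H, W)

img = torch.rand(1, 3, 224, 224)
patches = patchify(img)                               # (1, 196, 768)
mean = patches.mean(dim=-1, keepdim=True)
var = patches.var(dim=-1, keepdim=True)
normalized = (patches - mean) / (var + 1e-6) ** 0.5   # per-patch normalized target
restored = unpatchify(normalized * (var + 1e-6) ** 0.5 + mean)  # same renorm as demo.py
assert torch.allclose(restored, img, atol=1e-5)       # round trip recovers the image
```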
dust3r/croco/interactive_demo.ipynb
ADDED
@@ -0,0 +1,271 @@
```json
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Interactive demo of Cross-view Completion."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n",
    "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import numpy as np\n",
    "from models.croco import CroCoNet\n",
    "from ipywidgets import interact, interactive, fixed, interact_manual\n",
    "import ipywidgets as widgets\n",
    "import matplotlib.pyplot as plt\n",
    "import quaternion\n",
    "import models.masking"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load CroCo model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')\n",
    "model = CroCoNet( **ckpt.get('croco_kwargs',{}))\n",
    "msg = model.load_state_dict(ckpt['model'], strict=True)\n",
    "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n",
    "device = torch.device('cuda:0' if use_gpu else 'cpu')\n",
    "model = model.eval()\n",
    "model = model.to(device=device)\n",
    "print(msg)\n",
    "\n",
    "def process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches=False):\n",
    "    \"\"\"\n",
    "    Perform cross-view completion using two input images, specified as NumPy arrays.\n",
    "    \"\"\"\n",
    "    # Replace the mask generator\n",
    "    model.mask_generator = models.masking.RandomMask(model.patch_embed.num_patches, masking_ratio)\n",
    "\n",
    "    # ImageNet-1k color normalization\n",
    "    imagenet_mean = torch.as_tensor([0.485, 0.456, 0.406]).reshape(1,3,1,1).to(device)\n",
    "    imagenet_std = torch.as_tensor([0.229, 0.224, 0.225]).reshape(1,3,1,1).to(device)\n",
    "\n",
    "    normalize_input_colors = True\n",
    "    is_output_normalized = True\n",
    "    with torch.no_grad():\n",
    "        # Cast data to torch\n",
    "        target_image = (torch.as_tensor(target_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n",
    "        ref_image = (torch.as_tensor(ref_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n",
    "\n",
    "        if normalize_input_colors:\n",
    "            ref_image = (ref_image - imagenet_mean) / imagenet_std\n",
    "            target_image = (target_image - imagenet_mean) / imagenet_std\n",
    "\n",
    "        out, mask, _ = model(target_image, ref_image)\n",
    "        # get target\n",
    "        if not is_output_normalized:\n",
    "            predicted_image = model.unpatchify(out)\n",
    "        else:\n",
    "            # The output only contains higher-order information:\n",
    "            # retrieve mean and standard deviation from the actual target image\n",
    "            patchified = model.patchify(target_image)\n",
    "            mean = patchified.mean(dim=-1, keepdim=True)\n",
    "            var = patchified.var(dim=-1, keepdim=True)\n",
    "            pred_renorm = out * (var + 1.e-6)**.5 + mean\n",
    "            predicted_image = model.unpatchify(pred_renorm)\n",
    "\n",
    "        image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None])\n",
    "        masked_target_image = (1 - image_masks) * target_image\n",
    "\n",
    "        if not reconstruct_unmasked_patches:\n",
    "            # Replace unmasked patches by their actual values\n",
    "            predicted_image = predicted_image * image_masks + masked_target_image\n",
    "\n",
    "        # Undo color normalization\n",
    "        if normalize_input_colors:\n",
    "            predicted_image = predicted_image * imagenet_std + imagenet_mean\n",
    "            masked_target_image = masked_target_image * imagenet_std + imagenet_mean\n",
    "\n",
    "        # Cast to NumPy\n",
    "        masked_target_image = np.asarray(torch.clamp(masked_target_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n",
    "        predicted_image = np.asarray(torch.clamp(predicted_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n",
    "        return masked_target_image, predicted_image"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use the Habitat simulator to render images from arbitrary viewpoints (requires habitat_sim to be installed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ[\"MAGNUM_LOG\"]=\"quiet\"\n",
    "os.environ[\"HABITAT_SIM_LOG\"]=\"quiet\"\n",
    "import habitat_sim\n",
    "\n",
    "scene = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.glb\"\n",
    "navmesh = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.navmesh\"\n",
    "\n",
    "sim_cfg = habitat_sim.SimulatorConfiguration()\n",
    "if use_gpu: sim_cfg.gpu_device_id = 0\n",
    "sim_cfg.scene_id = scene\n",
    "sim_cfg.load_semantic_mesh = False\n",
    "rgb_sensor_spec = habitat_sim.CameraSensorSpec()\n",
    "rgb_sensor_spec.uuid = \"color\"\n",
    "rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR\n",
    "rgb_sensor_spec.resolution = (224,224)\n",
    "rgb_sensor_spec.hfov = 56.56\n",
    "rgb_sensor_spec.position = [0.0, 0.0, 0.0]\n",
    "rgb_sensor_spec.orientation = [0, 0, 0]\n",
    "agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec])\n",
    "\n",
    "cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])\n",
    "sim = habitat_sim.Simulator(cfg)\n",
    "if navmesh is not None:\n",
    "    sim.pathfinder.load_nav_mesh(navmesh)\n",
    "agent = sim.initialize_agent(agent_id=0)\n",
    "\n",
    "def sample_random_viewpoint():\n",
    "    \"\"\" Sample a random viewpoint using the navmesh \"\"\"\n",
    "    nav_point = sim.pathfinder.get_random_navigable_point()\n",
    "    # Sample a random viewpoint height\n",
    "    viewpoint_height = np.random.uniform(1.0, 1.6)\n",
    "    viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP\n",
    "    viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(-np.pi, np.pi) * habitat_sim.geo.UP)\n",
    "    return viewpoint_position, viewpoint_orientation\n",
    "\n",
    "def render_viewpoint(position, orientation):\n",
    "    agent_state = habitat_sim.AgentState()\n",
    "    agent_state.position = position\n",
    "    agent_state.rotation = orientation\n",
    "    agent.set_state(agent_state)\n",
    "    viewpoint_observations = sim.get_sensor_observations(agent_ids=0)\n",
    "    image = viewpoint_observations['color'][:,:,:3]\n",
    "    image = np.asarray(np.clip(1.5 * np.asarray(image, dtype=float), 0, 255), dtype=np.uint8)\n",
    "    return image"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sample a random reference view"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_position, ref_orientation = sample_random_viewpoint()\n",
    "ref_image = render_viewpoint(ref_position, ref_orientation)\n",
    "plt.clf()\n",
    "fig, axes = plt.subplots(1,1, squeeze=False, num=1)\n",
    "axes[0,0].imshow(ref_image)\n",
    "for ax in axes.flatten():\n",
    "    ax.set_xticks([])\n",
    "    ax.set_yticks([])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Interactive cross-view completion using CroCo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reconstruct_unmasked_patches = False\n",
    "\n",
    "def show_demo(masking_ratio, x, y, z, panorama, elevation):\n",
    "    R = quaternion.as_rotation_matrix(ref_orientation)\n",
    "    target_position = ref_position + x * R[:,0] + y * R[:,1] + z * R[:,2]\n",
    "    target_orientation = (ref_orientation\n",
    "         * quaternion.from_rotation_vector(-elevation * np.pi/180 * habitat_sim.geo.LEFT)\n",
    "         * quaternion.from_rotation_vector(-panorama * np.pi/180 * habitat_sim.geo.UP))\n",
    "\n",
    "    ref_image = render_viewpoint(ref_position, ref_orientation)\n",
    "    target_image = render_viewpoint(target_position, target_orientation)\n",
    "\n",
    "    masked_target_image, predicted_image = process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches)\n",
    "\n",
    "    fig, axes = plt.subplots(1,4, squeeze=True, dpi=300)\n",
    "    axes[0].imshow(ref_image)\n",
    "    axes[0].set_xlabel(\"Reference\")\n",
    "    axes[1].imshow(masked_target_image)\n",
    "    axes[1].set_xlabel(\"Masked target\")\n",
    "    axes[2].imshow(predicted_image)\n",
    "    axes[2].set_xlabel(\"Reconstruction\")\n",
    "    axes[3].imshow(target_image)\n",
    "    axes[3].set_xlabel(\"Target\")\n",
    "    for ax in axes.flatten():\n",
    "        ax.set_xticks([])\n",
    "        ax.set_yticks([])\n",
    "\n",
    "interact(show_demo,\n",
    "         masking_ratio=widgets.FloatSlider(description='masking', value=0.9, min=0.0, max=1.0),\n",
    "         x=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
    "         y=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
    "         z=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n",
    "         panorama=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5),\n",
    "         elevation=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5));"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  },
  "vscode": {
   "interpreter": {
    "hash": "f9237820cd248d7e07cb4fb9f0e4508a85d642f19d831560c0a4b61f3e907e67"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
```
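The notebook's process_images can also be driven without the ipywidgets UI. A hypothetical standalone snippet follows (it assumes the model-loading cell above has been executed in the same session; it reuses the repo's Chateau test images, and the resize and output path are illustrative):

```python
import numpy as np
from PIL import Image

# Load two 224x224 RGB views as uint8 HxWx3 arrays, as process_images expects.
ref = np.asarray(Image.open('assets/Chateau1.png').convert('RGB').resize((224, 224)))
tgt = np.asarray(Image.open('assets/Chateau2.png').convert('RGB').resize((224, 224)))

# Reconstruct the target from the reference with 90% of target patches masked.
masked, pred = process_images(ref, tgt, masking_ratio=0.9)
Image.fromarray(pred).save('reconstruction.png')
```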