rakib72642 committed on Nov 10, 2024

Commit

985cbbd

1 Parent(s): f363644

init commit

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

INSTALL.md +0 -34
README.md +0 -405
cog.yaml +0 -24
cutler/__init__.py +0 -15
cutler/config/__init__.py +0 -3
cutler/config/cutler_config.py +0 -19
cutler/data/__init__.py +0 -15
cutler/data/build.py +0 -561
cutler/data/dataset_mapper.py +0 -193
cutler/data/datasets/__init__.py +0 -16
cutler/data/datasets/builtin.py +0 -216
cutler/data/datasets/builtin_meta.py +0 -389
cutler/data/datasets/coco.py +0 -544
cutler/data/detection_utils.py +0 -650
cutler/data/transforms/__init__.py +0 -15
cutler/data/transforms/augmentation_impl.py +0 -616
cutler/data/transforms/transform.py +0 -355
cutler/demo/__init__.py +0 -5
cutler/demo/demo.py +0 -197
cutler/demo/predictor.py +0 -219
cutler/engine/__init__.py +0 -7
cutler/engine/defaults.py +0 -726
cutler/engine/train_loop.py +0 -360
cutler/evaluation/__init__.py +0 -3
cutler/evaluation/coco_evaluation.py +0 -727
cutler/model_zoo/configs/Base-RCNN-FPN.yaml +0 -42
cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_100perc.yaml +0 -40
cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_10perc.yaml +0 -40
cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_1perc.yaml +0 -42
cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_20perc.yaml +0 -40
cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_2perc.yaml +0 -42
cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_30perc.yaml +0 -40
cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_40perc.yaml +0 -40
cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_50perc.yaml +0 -40
cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_5perc.yaml +0 -42
cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_60perc.yaml +0 -40
cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_80perc.yaml +0 -40
cutler/model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN.yaml +0 -61
cutler/model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN_demo.yaml +0 -62
cutler/model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN_self_train.yaml +0 -60
cutler/model_zoo/configs/CutLER-ImageNet/mask_rcnn_R_50_FPN.yaml +0 -52
cutler/modeling/__init__.py +0 -16
cutler/modeling/meta_arch/__init__.py +0 -7
cutler/modeling/meta_arch/build.py +0 -27
cutler/modeling/meta_arch/rcnn.py +0 -344
cutler/modeling/roi_heads/__init__.py +0 -16
cutler/modeling/roi_heads/custom_cascade_rcnn.py +0 -338
cutler/modeling/roi_heads/fast_rcnn.py +0 -587
cutler/modeling/roi_heads/roi_heads.py +0 -926
cutler/solver/__init__.py +0 -5

INSTALL.md DELETED Viewed

@@ -1,34 +0,0 @@
-# Installation
-## Requirements
-- Linux or macOS with Python ≥ 3.8
-- PyTorch ≥ 1.8 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
-  Install them together at [pytorch.org](https://pytorch.org) to make sure of this.
-  Note: please check that the PyTorch version matches the one required by Detectron2.
-- Detectron2: follow Detectron2 installation instructions.
-- OpenCV ≥ 4.6 is needed by demo and visualization.
-## Example conda environment setup
-```bash
-conda create --name cutler python=3.8 -y
-conda activate cutler
-conda install pytorch==1.8.1 torchvision==0.9.1 torchaudio==0.8.1 -c pytorch
-pip install git+https://github.com/lucasb-eyer/pydensecrf.git
-# under your working directory
-git clone git@github.com:facebookresearch/detectron2.git
-cd detectron2
-pip install -e .
-pip install git+https://github.com/cocodataset/panopticapi.git
-pip install git+https://github.com/mcordts/cityscapesScripts.git
-cd ..
-git clone --recursive git@github.com:facebookresearch/CutLER.git
-cd CutLER
-pip install -r requirements.txt
-```
-## datasets
-If you want to train/evaluate on the datasets, please see [datasets/README.md](datasets/README.md) for how we prepare datasets for this project.

README.md DELETED Viewed

@@ -1,405 +0,0 @@
-# Cut and Learn for Unsupervised Image & Video Object Detection and Instance Segmentation
-**Cut**-and-**LE**a**R**n (**CutLER**) is a simple approach for training object detection and instance segmentation models without human annotations.
-It outperforms previous SOTA by **2.7 times** for AP50 and **2.6 times** for AR on **11 benchmarks**.
-<p align="center"> <img src='docs/teaser_img.jpg' align="center" > </p>
-> [**Cut and Learn for Unsupervised Object Detection and Instance Segmentation**](http://people.eecs.berkeley.edu/~xdwang/projects/CutLER/)
-> [Xudong Wang](https://people.eecs.berkeley.edu/~xdwang/), [Rohit Girdhar](https://rohitgirdhar.github.io/), [Stella X. Yu](https://www1.icsi.berkeley.edu/~stellayu/), [Ishan Misra](https://imisra.github.io/)
-> FAIR, Meta AI; UC Berkeley
-> CVPR 2023
-[[`project page`](http://people.eecs.berkeley.edu/~xdwang/projects/CutLER/)] [[`arxiv`](https://arxiv.org/abs/2301.11320)] [[`colab`](https://colab.research.google.com/drive/1NgEyFHvOfuA2MZZnfNPWg1w5gSr3HOBb?usp=sharing)] [[`bibtex`](#citation)]
-Unsupervised video instance segmentation (**VideoCutLER**) is also supported. ***We demonstrate that video instance segmentation models can be learned without using any human annotations, without relying on natural videos (ImageNet data alone is sufficient), and even without motion estimations!*** The code is available [here](videocutler).
-<p align="center">
-  <img src="docs/demos_videocutler.gif" width=100%>
-</p>
-> [**VideoCutLER: Surprisingly Simple Unsupervised Video Instance Segmentation**](https://people.eecs.berkeley.edu/~xdwang/projects/VideoCutLER/videocutler.pdf)
-> [Xudong Wang](https://people.eecs.berkeley.edu/~xdwang/), [Ishan Misra](https://imisra.github.io/), Ziyun Zeng, [Rohit Girdhar](https://rohitgirdhar.github.io/), [Trevor Darrell](https://people.eecs.berkeley.edu/~trevor/)
-> UC Berkeley; FAIR, Meta AI
-> CVPR 2024
-[[`code`](videocutler/README.md)] [[`PDF`](https://people.eecs.berkeley.edu/~xdwang/projects/VideoCutLER/videocutler.pdf)] [[`arxiv`](https://arxiv.org/abs/2308.14710)] [[`bibtex`](#citation)]
-## Features
-- We propose the MaskCut approach to generate pseudo-masks for multiple objects in an image.
-- CutLER can learn unsupervised object detectors and instance segmentors solely on ImageNet-1K.
-- CutLER exhibits strong robustness to domain shifts when evaluated on 11 different benchmarks across domains like natural images, video frames, paintings, sketches, etc.
-- CutLER can serve as a pretrained model for fully/semi-supervised detection and segmentation tasks.
-- We also propose VideoCutLER, a surprisingly simple unsupervised video instance segmentation (UVIS) method without relying on optical flows. ImageNet-1K is all we need for training a SOTA UVIS model!
-## Installation
-See [installation instructions](INSTALL.md).
-## Dataset Preparation
-See [Preparing Datasets for CutLER](datasets/README.md).
-## Method Overview
-<p align="center">
-  <img src="docs/pipeline.jpg" width=55%>
-</p>
-Cut-and-Learn has two stages: 1) generating pseudo-masks with MaskCut and 2) learning unsupervised detectors from pseudo-masks of unlabeled data.
-### 1. MaskCut
-MaskCut can be used to provide segmentation masks for multiple instances of each image.
-<p align="center">
-  <img src="docs/maskcut.gif" width=100%>
-</p>
-### MaskCut Demo
-Try out the MaskCut demo using Colab (no GPU needed): [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1X05lKL_IBRvZB7q6n6pb4w00_tIYjGlf?usp=sharing)
-Try out the web demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/facebook/MaskCut) (thanks to [@hysts](https://github.com/hysts)!)
-If you want to run MaskCut locally, we provide `demo.py` that is able to visualize the pseudo-masks produced by MaskCut.
-Run it with:
-```
-cd maskcut
-python demo.py --img-path imgs/demo2.jpg \
-  --N 3 --tau 0.15 --vit-arch base --patch-size 8 \
-  [--other-options]
-```
-We give a few demo images in maskcut/imgs/. If you want to run demo.py with cpu, simply add "--cpu" when running the demo script.
-For imgs/demo4.jpg, you need to use "--N 6" to segment all six instances in the image.
-Below, we show some visualizations of the pseudo-masks on the demo images.
-<p align="center">
-  <img src="docs/maskcut-demo.jpg" width=100%>
-</p>
-### Generating Annotations for ImageNet-1K with MaskCut
-To generate pseudo-masks for ImageNet-1K using MaskCut, first set up the ImageNet-1K dataset according to the instructions in [datasets/README.md](datasets/README.md), then execute the following command:
-```
-cd maskcut
-python maskcut.py \
---vit-arch base --patch-size 8 \
---tau 0.15 --fixed_size 480 --N 3 \
---num-folder-per-job 1000 --job-index 0 \
---dataset-path /path/to/dataset/traindir \
---out-dir /path/to/save/annotations
-```
-As the process of generating pseudo-masks for all 1.3 million images in 1,000 folders takes a significant amount of time, it is recommended to use multiple runs. Each run should process the pseudo-mask generation for a smaller number of image folders by setting "--num-folder-per-job" and "--job-index". Once all runs are completed, you can merge all the resulting json files by using the following command:
-```
-python merge_jsons.py \
---base-dir /path/to/save/annotations \
---num-folder-per-job 2 --fixed-size 480 \
---tau 0.15 --N 3 \
---save-path imagenet_train_fixsize480_tau0.15_N3.json
-```
-The "--num-folder-per-job", "--fixed-size", "--tau" and "--N" of merge_jsons.py should match the ones used to run maskcut.py.
-We also provide a submitit script to launch the pseudo-mask generation process with multiple nodes.
-```
-cd maskcut
-bash run_maskcut_with_submitit.sh
-```
-After that, you can use "merge_jsons.py" to merge all these json files as described above.
-### 2. CutLER
-### Inference Demo for CutLER with Pre-trained Models
-Try out the CutLER demo using Colab (no GPU needed): [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1NgEyFHvOfuA2MZZnfNPWg1w5gSr3HOBb?usp=sharing)
-Try out the web demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/facebook/CutLER) (thanks to [@hysts](https://github.com/hysts)!)
-Try out Replicate demo and the API: [![Replicate](https://replicate.com/cjwbw/cutler/badge)](https://replicate.com/cjwbw/cutler)
-If you want to run CutLER demos locally,
-1. Pick a model and its config file from [model zoo](#model-zoo),
-  for example, `model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN.yaml`.
-2. We provide `demo.py` that is able to demo builtin configs. Run it with:
-```
-cd cutler
-python demo/demo.py --config-file model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN_demo.yaml \
-  --input demo/imgs/*.jpg \
-  [--other-options]
-  --opts MODEL.WEIGHTS /path/to/cutler_w_cascade_checkpoint
-```
-The configs are made for training; therefore, for evaluation we need to set `MODEL.WEIGHTS` to a model from the model zoo.
-This command will run the inference and show visualizations in an OpenCV window.
-<!-- For details of the command line arguments, see `demo.py -h` or look at its source code
-to understand its behavior. Some common arguments are: -->
-* To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`.
-* To save outputs to a directory (for images) or a file (for webcam or video), use `--output`.
-Below, we show some visualizations of the model predictions on the demo images.
-<p align="center">
-  <img src="docs/cutler-demo.jpg" width=100%>
-</p>
-### Unsupervised Model Learning
-Before training the detector, it is necessary to use MaskCut to generate pseudo-masks for all ImageNet data.
-You can either use the pre-generated json file directly by downloading it from [here](http://dl.fbaipublicfiles.com/cutler/maskcut/imagenet_train_fixsize480_tau0.15_N3.json) and placing it under "DETECTRON2_DATASETS/imagenet/annotations/", or generate your own pseudo-masks by following the instructions in [MaskCut](#1-maskcut).
-We provide a script `train_net.py`, that is made to train all the configs provided in CutLER.
-To train a model with "train_net.py", first set up the ImageNet-1K dataset following [datasets/README.md](datasets/README.md), then run:
-```
-cd cutler
-export DETECTRON2_DATASETS=/path/to/DETECTRON2_DATASETS/
-python train_net.py --num-gpus 8 \
-  --config-file model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN.yaml
-```
-If you want to train a model using multiple nodes, you may need to adjust [some model parameters](https://arxiv.org/abs/1706.02677) and some SBATCH command options in "tools/train-1node.sh" and "tools/single-node_run.sh", then run:
-```
-cd cutler
-sbatch tools/train-1node.sh \
-  --config-file model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN.yaml \
-  MODEL.WEIGHTS /path/to/dino/d2format/model \
-  OUTPUT_DIR output/
-```
-You can also convert a pre-trained DINO model to detectron2's format by yourself following [this link](https://github.com/facebookresearch/moco/tree/main/detection).
-### Self-training
-We further improve performance by self-training the model on its predictions.
-First, we can get model predictions on ImageNet by running:
-```
-python train_net.py --num-gpus 8 \
-  --config-file model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN.yaml \
-  --test-dataset imagenet_train \
-  --eval-only TEST.DETECTIONS_PER_IMAGE 30 \
-  MODEL.WEIGHTS output/model_final.pth \ # load previous stage/round checkpoints
-  OUTPUT_DIR output/ # path to save model predictions
-```
-Secondly, we can run the following command to generate the json file for the first round of self-training:
-```
-python tools/get_self_training_ann.py \
-  --new-pred output/inference/coco_instances_results.json \ # load model predictions
-  --prev-ann DETECTRON2_DATASETS/imagenet/annotations/imagenet_train_fixsize480_tau0.15_N3.json \ # path to the old annotation file.
-  --save-path DETECTRON2_DATASETS/imagenet/annotations/cutler_imagenet1k_train_r1.json \ # path to save a new annotation file.
-  --threshold 0.7
-```
-Finally, place "cutler_imagenet1k_train_r1.json" under "DETECTRON2_DATASETS/imagenet/annotations/", then launch the self-training process:
-```
-python train_net.py --num-gpus 8 \
-  --config-file model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN_self_train.yaml \
-  --train-dataset imagenet_train_r1 \
-  MODEL.WEIGHTS output/model_final.pth \ # load previous stage/round checkpoints
-  OUTPUT_DIR output/self-train-r1/ # path to save checkpoints
-```
-You can repeat the steps above to perform multiple rounds of self-training and adjust some arguments as needed (e.g., "--threshold" for round 1 and 2 can be set to 0.7 and 0.65, respectively; "--train-dataset" for round 1 and 2 can be set to "imagenet_train_r1" and "imagenet_train_r2", respectively; MODEL.WEIGHTS for round 1 and 2 should point to the previous stage/round checkpoints). Ensure that all annotation files are placed under DETECTRON2_DATASETS/imagenet/annotations/.
-Please ensure that "--train-dataset", json file names and locations match the ones specified in "cutler/data/datasets/builtin.py".
-Please refer to this [instruction](https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html) for guidance on using custom datasets.
-You can also directly download the MODEL.WEIGHTS and annotations used for each round of self-training:
-<table><tbody>
-<!-- START TABLE -->
-<!-- TABLE BODY -->
-<!-- ROW: round 1 -->
-<tr><td align="center">round 1</td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_r1.pth">cutler_cascade_r1.pth</a></td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/maskcut/cutler_imagenet1k_train_r1.json">cutler_imagenet1k_train_r1.json</a></td>
-</tr>
-<!-- ROW: round 2 -->
-<tr><td align="center">round 2</td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_r2.pth">cutler_cascade_r2.pth</a></td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/maskcut/cutler_imagenet1k_train_r2.json">cutler_imagenet1k_train_r2.json</a></td>
-</tr>
-</tbody></table>
-### Unsupervised Zero-shot Evaluation
-To evaluate a model's performance on 11 different datasets, please refer to [datasets/README.md](datasets/README.md) for instructions on preparing the datasets. Next, select a model from the model zoo, specify the "model_weights", "config_file" and the path to "DETECTRON2_DATASETS" in `tools/eval.sh`, then run the script.
-```
-bash tools/eval.sh
-```
-### Model Zoo
-We show zero-shot unsupervised object detection performance (AP50&nbsp;|&nbsp;AR) on 11 different datasets spanning a variety of domains. ^: CutLER using Mask R-CNN as a detector; *: CutLER using Cascade Mask R-CNN as a detector.
-<table><tbody>
-<!-- START TABLE -->
-<!-- TABLE HEADER -->
-<tr>
-<th valign="bottom">Methods</th>
-<th valign="bottom">Models</th>
-<th valign="bottom">COCO</th>
-<th valign="bottom">COCO20K</th>
-<th valign="bottom">VOC</th>
-<th valign="bottom">LVIS</th>
-<th valign="bottom">UVO</th>
-<th valign="bottom">Clipart</th>
-<th valign="bottom">Comic</th>
-<th valign="bottom">Watercolor</th>
-<th valign="bottom">KITTI</th>
-<th valign="bottom">Objects365</th>
-<th valign="bottom">OpenImages</th>
-<!-- TABLE BODY -->
-</tr>
-<tr><td align="center">Prev. SOTA</td>
-<td valign="bottom">-</td>
-<td align="center">9.6&nbsp;|&nbsp;12.6</td>
-<td align="center">9.7&nbsp;|&nbsp;12.6</td>
-<td align="center">15.9&nbsp;|&nbsp;21.3</td>
-<td align="center">3.8&nbsp;|&nbsp;6.4</td>
-<td align="center">10.0&nbsp;|&nbsp;14.2</td>
-<td align="center">7.9&nbsp;|&nbsp;15.1</td>
-<td align="center">9.9&nbsp;|&nbsp;16.3</td>
-<td align="center">6.7&nbsp;|&nbsp;16.2</td>
-<td align="center">7.7&nbsp;|&nbsp;7.1</td>
-<td align="center">8.1&nbsp;|&nbsp;10.2</td>
-<td align="center">9.9&nbsp;|&nbsp;14.9</td>
-</tr>
-<!-- ROW: AP50 | AR for CutLER (Mask R-CNN) -->
-<tr><td align="center">CutLER^</td>
-<td valign="bottom"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_mrcnn_final.pth">download</a></td>
-<td align="center">21.1&nbsp;|&nbsp;29.6</td>
-<td align="center">21.6&nbsp;|&nbsp;30.0</td>
-<td align="center">36.6&nbsp;|&nbsp;41.0</td>
-<td align="center">7.7&nbsp;|&nbsp;18.7</td>
-<td align="center">29.8&nbsp;|&nbsp;38.4</td>
-<td align="center">20.9&nbsp;|&nbsp;38.5</td>
-<td align="center">31.2&nbsp;|&nbsp;37.1</td>
-<td align="center">37.3&nbsp;|&nbsp;39.9</td>
-<td align="center">15.3&nbsp;|&nbsp;25.4</td>
-<td align="center">19.5&nbsp;|&nbsp;30.0</td>
-<td align="center">17.1&nbsp;|&nbsp;26.4</td>
-</tr>
-<!-- ROW: AP50 | AR for CutLER (Cascade Mask R-CNN) -->
-<tr><td align="center">CutLER*</td>
-<td valign="bottom"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_final.pth">download</a></td>
-<td align="center">21.9&nbsp;|&nbsp;32.7</td>
-<td align="center">22.4&nbsp;|&nbsp;33.1</td>
-<td align="center">36.9&nbsp;|&nbsp;44.3</td>
-<td align="center">8.4&nbsp;|&nbsp;21.8</td>
-<td align="center">31.7&nbsp;|&nbsp;42.8</td>
-<td align="center">21.1&nbsp;|&nbsp;41.3</td>
-<td align="center">30.4&nbsp;|&nbsp;38.6</td>
-<td align="center">37.5&nbsp;|&nbsp;44.6</td>
-<td align="center">18.4&nbsp;|&nbsp;27.5</td>
-<td align="center">21.6&nbsp;|&nbsp;34.2</td>
-<td align="center">17.3&nbsp;|&nbsp;29.6</td>
-</tr>
-</tbody></table>
-## Semi-supervised and Fully-supervised Learning
-CutLER can also serve as a pretrained model for training fully supervised object detection and instance segmentation models and improves performance on COCO, including on few-shot benchmarks.
-### Training & Evaluation in Command Line
-You can find all the semi-supervised and fully-supervised learning configs provided in CutLER under `model_zoo/configs/COCO-Semisupervised`.
-To train a model using K% labels with `train_net.py`, first set up the COCO dataset according to [datasets/README.md](datasets/README.md) and specify K value in the config file, then run:
-```
-python train_net.py --num-gpus 8 \
-  --config-file model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_{K}perc.yaml \
-  MODEL.WEIGHTS /path/to/cutler_pretrained_model
-```
-You can find all config files used to train supervised models under `model_zoo/configs/COCO-Semisupervised`.
-The configs are made for 8-GPU training. To train on 1 GPU, you may need to [change some parameters](https://arxiv.org/abs/1706.02677), e.g. number of GPUs (num-gpus your_num_gpus), learning rates (SOLVER.BASE_LR your_base_lr) and batch size (SOLVER.IMS_PER_BATCH your_batch_size).
-### Evaluation
-To evaluate a model's performance, use
-```
-python train_net.py \
-  --config-file model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_{K}perc.yaml \
-  --eval-only MODEL.WEIGHTS /path/to/checkpoint_file
-```
-For more options, see `python train_net.py -h`.
-### Model Zoo
-We fine-tune a Cascade R-CNN model initialized with CutLER or MoCo-v2 on varying amounts of labeled COCO data, and show results (Box&nbsp;|&nbsp;Mask AP) on the val2017 split below:
-<table><tbody>
-<!-- START TABLE -->
-<!-- TABLE HEADER -->
-<th valign="bottom">% of labels</th>
-<th valign="bottom">1%</th>
-<th valign="bottom">2%</th>
-<th valign="bottom">5%</th>
-<th valign="bottom">10%</th>
-<th valign="bottom">20%</th>
-<th valign="bottom">30%</th>
-<th valign="bottom">40%</th>
-<th valign="bottom">50%</th>
-<th valign="bottom">60%</th>
-<th valign="bottom">80%</th>
-<th valign="bottom">100%</th>
-<!-- TABLE BODY -->
-<!-- ROW: Box/Mask AP for MoCo-v2 -->
-<tr><td align="center">MoCo-v2</td>
-<td align="center">11.8&nbsp;|&nbsp;10.0</td>
-<td align="center">16.2&nbsp;|&nbsp;13.8</td>
-<td align="center">20.5&nbsp;|&nbsp;17.8</td>
-<td align="center">26.5&nbsp;|&nbsp;23.0</td>
-<td align="center">32.5&nbsp;|&nbsp;28.2</td>
-<td align="center">35.5&nbsp;|&nbsp;30.8</td>
-<td align="center">37.3&nbsp;|&nbsp;32.3</td>
-<td align="center">38.7&nbsp;|&nbsp;33.6</td>
-<td align="center">39.9&nbsp;|&nbsp;34.6</td>
-<td align="center">41.6&nbsp;|&nbsp;36.0</td>
-<td align="center">42.8&nbsp;|&nbsp;37.0</td>
-</tr>
-<!-- ROW: Box/Mask AP for CutLER -->
-<tr><td align="center">CutLER</td>
-<td align="center">16.8&nbsp;|&nbsp;14.6</td>
-<td align="center">21.6&nbsp;|&nbsp;18.9</td>
-<td align="center">27.8&nbsp;|&nbsp;24.3</td>
-<td align="center">32.2&nbsp;|&nbsp;28.1</td>
-<td align="center">36.6&nbsp;|&nbsp;31.7</td>
-<td align="center">38.2&nbsp;|&nbsp;33.3</td>
-<td align="center">39.9&nbsp;|&nbsp;34.7</td>
-<td align="center">41.5&nbsp;|&nbsp;35.9</td>
-<td align="center">42.3&nbsp;|&nbsp;36.7</td>
-<td align="center">43.8&nbsp;|&nbsp;37.9</td>
-<td align="center">44.7&nbsp;|&nbsp;38.5</td>
-</tr>
-<!-- ROW: Model Downloads -->
-<tr><td align="center">Download</td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_semi_1perc.pth">model</a></td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_semi_2perc.pth">model</a></td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_semi_5perc.pth">model</a></td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_semi_10perc.pth">model</a></td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_semi_20perc.pth">model</a></td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_semi_30perc.pth">model</a></td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_semi_40perc.pth">model</a></td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_semi_50perc.pth">model</a></td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_semi_60perc.pth">model</a></td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_semi_80perc.pth">model</a></td>
-<td align="center"><a href="http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_fully_100perc.pth">model</a></td>
-</tr>
-</tbody></table>
-Both MoCo-v2 and our CutLER are trained for the 1x schedule using Detectron2, except for extremely low-shot settings with 1% or 2% labels. When training with 1% or 2% labels, we train both MoCo-v2 and our model for 3,600 iterations with a batch size of 16.
-## License
-The majority of CutLER, Detectron2 and DINO are licensed under the [CC-BY-NC license](LICENSE); however, portions of the project are available under separate license terms: TokenCut, Bilateral Solver and CRF are licensed under the MIT license. If you later add other third-party code, please keep this license info updated, and please let us know if that component is licensed under something other than CC-BY-NC, MIT, or CC0.
-## Ethical Considerations
-CutLER's wide range of detection capabilities may introduce similar challenges to many other visual recognition methods.
-As the image can contain arbitrary instances, it may impact the model output.
-## How to get support from us?
-If you have any general questions, feel free to email us at [Xudong Wang](mailto:xdwang@eecs.berkeley.edu), [Ishan Misra](mailto:imisra@meta.com) and [Rohit Girdhar](mailto:rgirdhar@meta.com). If you have code or implementation-related questions, please feel free to send emails to us or open an issue in this codebase (We recommend that you open an issue in this codebase, because your questions may help others).
-## Citation
-If you find our work inspiring or use our codebase in your research, please consider giving a star ⭐ and a citation.
-```
-@inproceedings{wang2023cut,
-  title={Cut and learn for unsupervised object detection and instance segmentation},
-  author={Wang, Xudong and Girdhar, Rohit and Yu, Stella X and Misra, Ishan},
-  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
-  pages={3124--3134},
-  year={2023}
-}
-```
-```
-@article{wang2023videocutler,
-  title={VideoCutLER: Surprisingly Simple Unsupervised Video Instance Segmentation},
-  author={Wang, Xudong and Misra, Ishan and Zeng, Ziyun and Girdhar, Rohit and Darrell, Trevor},
-  journal={arXiv preprint arXiv:2308.14710},
-  year={2023}
-}
-```

cog.yaml DELETED Viewed

@@ -1,24 +0,0 @@
-build:
-  gpu: true
-  cuda: "11.6"
-  python_version: "3.8"
-  python_packages:
-    - "torch==1.11.0"
-    - "torchvision==0.12.0"
-    - "faiss-gpu==1.7.2"
-    - "opencv-python==4.6.0.66"
-    - "scikit-image==0.19.2"
-    - "scikit-learn==1.1.1"
-    - "shapely==1.8.2"
-    - "timm==0.5.4"
-    - "pyyaml==6.0"
-    - "colored==1.4.4"
-    - "fvcore==0.1.5.post20220512"
-    - "gdown==4.5.4"
-    - "pycocotools==2.0.6"
-    - "numpy==1.20.0"
-  run:
-    - pip install git+https://github.com/lucasb-eyer/pydensecrf.git
-predict: "maskcut/predict.py:Predictor"

cutler/__init__.py DELETED Viewed

@@ -1,15 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-import config
-import engine
-import modeling
-import structures
-import tools
-import demo
-# dataset loading
-from . import data  # register all new datasets
-from data import datasets  # register all new datasets
-from solver import *
-# from .data import register_all_imagenet

cutler/config/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-from .cutler_config import add_cutler_config

cutler/config/cutler_config.py DELETED Viewed

@@ -1,19 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-from detectron2.config import CfgNode as CN
-def add_cutler_config(cfg):
-    cfg.DATALOADER.COPY_PASTE = False
-    cfg.DATALOADER.COPY_PASTE_RATE = 0.0
-    cfg.DATALOADER.COPY_PASTE_MIN_RATIO = 0.5
-    cfg.DATALOADER.COPY_PASTE_MAX_RATIO = 1.0
-    cfg.DATALOADER.COPY_PASTE_RANDOM_NUM = True
-    cfg.DATALOADER.VISUALIZE_COPY_PASTE = False
-    cfg.MODEL.ROI_HEADS.USE_DROPLOSS = False
-    cfg.MODEL.ROI_HEADS.DROPLOSS_IOU_THRESH = 0.0
-    cfg.SOLVER.BASE_LR_MULTIPLIER = 1
-    cfg.SOLVER.BASE_LR_MULTIPLIER_NAMES = []
-    cfg.TEST.NO_SEGM = False

cutler/data/__init__.py DELETED Viewed

@@ -1,15 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-from . import datasets  # ensure the builtin datasets are registered
-from .detection_utils import *  # isort:skip
-from .build import (
-    build_batch_data_loader,
-    build_detection_train_loader,
-    build_detection_test_loader,
-    get_detection_dataset_dicts,
-    load_proposals_into_dataset,
-    print_instances_class_histogram,
-    )
-from detectron2.data.common import *
-__all__ = [k for k in globals().keys() if not k.startswith("_")]

cutler/data/build.py DELETED Viewed

@@ -1,561 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/build.py
-import itertools
-import logging
-import numpy as np
-import operator
-import pickle
-from typing import Any, Callable, Dict, List, Optional, Union
-import torch
-import torch.utils.data as torchdata
-from tabulate import tabulate
-from termcolor import colored
-from detectron2.config import configurable
-from detectron2.structures import BoxMode
-from detectron2.utils.comm import get_world_size
-from detectron2.utils.env import seed_all_rng
-from detectron2.utils.file_io import PathManager
-from detectron2.utils.logger import _log_api_usage, log_first_n
-from detectron2.data.catalog import DatasetCatalog, MetadataCatalog
-from detectron2.data.common import AspectRatioGroupedDataset, DatasetFromList, MapDataset, ToIterableDataset
-from data.dataset_mapper import DatasetMapper
-from data.detection_utils import check_metadata_consistency
-from detectron2.data.samplers import (
-    InferenceSampler,
-    RandomSubsetTrainingSampler,
-    RepeatFactorTrainingSampler,
-    TrainingSampler,
-)
-"""
-This file contains the default logic to build a dataloader for training or testing.
-"""
-__all__ = [
-    "build_batch_data_loader",
-    "build_detection_train_loader",
-    "build_detection_test_loader",
-    "get_detection_dataset_dicts",
-    "load_proposals_into_dataset",
-    "print_instances_class_histogram",
-]
-def filter_images_with_only_crowd_annotations(dataset_dicts):
-    """
-    Filter out images with none annotations or only crowd annotations
-    (i.e., images without non-crowd annotations).
-    A common training-time preprocessing on COCO dataset.
-    Args:
-        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
-    Returns:
-        list[dict]: the same format, but filtered.
-    """
-    num_before = len(dataset_dicts)
-    def valid(anns):
-        for ann in anns:
-            if ann.get("iscrowd", 0) == 0:
-                return True
-        return False
-    dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])]
-    num_after = len(dataset_dicts)
-    logger = logging.getLogger(__name__)
-    logger.info(
-        "Removed {} images with no usable annotations. {} images left.".format(
-            num_before - num_after, num_after
-        )
-    )
-    print("Removed {} images with no usable annotations. {} images left.".format(
-            num_before - num_after, num_after
-        ))
-    return dataset_dicts
-def filter_images_with_few_keypoints(dataset_dicts, min_keypoints_per_image):
-    """
-    Filter out images with too few number of keypoints.
-    Args:
-        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
-    Returns:
-        list[dict]: the same format as dataset_dicts, but filtered.
-    """
-    num_before = len(dataset_dicts)
-    def visible_keypoints_in_image(dic):
-        # Each keypoints field has the format [x1, y1, v1, ...], where v is visibility
-        annotations = dic["annotations"]
-        return sum(
-            (np.array(ann["keypoints"][2::3]) > 0).sum()
-            for ann in annotations
-            if "keypoints" in ann
-        )
-    dataset_dicts = [
-        x for x in dataset_dicts if visible_keypoints_in_image(x) >= min_keypoints_per_image
-    ]
-    num_after = len(dataset_dicts)
-    logger = logging.getLogger(__name__)
-    logger.info(
-        "Removed {} images with fewer than {} keypoints.".format(
-            num_before - num_after, min_keypoints_per_image
-        )
-    )
-    return dataset_dicts
-def load_proposals_into_dataset(dataset_dicts, proposal_file):
-    """
-    Load precomputed object proposals into the dataset.
-    The proposal file should be a pickled dict with the following keys:
-    - "ids": list[int] or list[str], the image ids
-    - "boxes": list[np.ndarray], each is an Nx4 array of boxes corresponding to the image id
-    - "objectness_logits": list[np.ndarray], each is an N sized array of objectness scores
-      corresponding to the boxes.
-    - "bbox_mode": the BoxMode of the boxes array. Defaults to ``BoxMode.XYXY_ABS``.
-    Args:
-        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
-        proposal_file (str): file path of pre-computed proposals, in pkl format.
-    Returns:
-        list[dict]: the same format as dataset_dicts, but added proposal field.
-    """
-    logger = logging.getLogger(__name__)
-    logger.info("Loading proposals from: {}".format(proposal_file))
-    with PathManager.open(proposal_file, "rb") as f:
-        proposals = pickle.load(f, encoding="latin1")
-    # Rename the key names in D1 proposal files
-    rename_keys = {"indexes": "ids", "scores": "objectness_logits"}
-    for key in rename_keys:
-        if key in proposals:
-            proposals[rename_keys[key]] = proposals.pop(key)
-    # Fetch the indexes of all proposals that are in the dataset
-    # Convert image_id to str since they could be int.
-    img_ids = set({str(record["image_id"]) for record in dataset_dicts})
-    id_to_index = {str(id): i for i, id in enumerate(proposals["ids"]) if str(id) in img_ids}
-    # Assuming default bbox_mode of precomputed proposals are 'XYXY_ABS'
-    bbox_mode = BoxMode(proposals["bbox_mode"]) if "bbox_mode" in proposals else BoxMode.XYXY_ABS
-    for record in dataset_dicts:
-        # Get the index of the proposal
-        i = id_to_index[str(record["image_id"])]
-        boxes = proposals["boxes"][i]
-        objectness_logits = proposals["objectness_logits"][i]
-        # Sort the proposals in descending order of the scores
-        inds = objectness_logits.argsort()[::-1]
-        record["proposal_boxes"] = boxes[inds]
-        record["proposal_objectness_logits"] = objectness_logits[inds]
-        record["proposal_bbox_mode"] = bbox_mode
-    return dataset_dicts
-def print_instances_class_histogram(dataset_dicts, class_names):
-    """
-    Args:
-        dataset_dicts (list[dict]): list of dataset dicts.
-        class_names (list[str]): list of class names (zero-indexed).
-    """
-    num_classes = len(class_names)
-    hist_bins = np.arange(num_classes + 1)
-    histogram = np.zeros((num_classes,), dtype=np.int)
-    for entry in dataset_dicts:
-        annos = entry["annotations"]
-        classes = np.asarray(
-            [x["category_id"] for x in annos if not x.get("iscrowd", 0)], dtype=np.int
-        )
-        if len(classes):
-            assert classes.min() >= 0, f"Got an invalid category_id={classes.min()}"
-            assert (
-                classes.max() < num_classes
-            ), f"Got an invalid category_id={classes.max()} for a dataset of {num_classes} classes"
-        histogram += np.histogram(classes, bins=hist_bins)[0]
-    N_COLS = min(6, len(class_names) * 2)
-    def short_name(x):
-        # make long class names shorter. useful for lvis
-        if len(x) > 13:
-            return x[:11] + ".."
-        return x
-    data = list(
-        itertools.chain(*[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)])
-    )
-    total_num_instances = sum(data[1::2])
-    data.extend([None] * (N_COLS - (len(data) % N_COLS)))
-    if num_classes > 1:
-        data.extend(["total", total_num_instances])
-    data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
-    table = tabulate(
-        data,
-        headers=["category", "#instances"] * (N_COLS // 2),
-        tablefmt="pipe",
-        numalign="left",
-        stralign="center",
-    )
-    log_first_n(
-        logging.INFO,
-        "Distribution of instances among all {} categories:\n".format(num_classes)
-        + colored(table, "cyan"),
-        key="message",
-    )
-def get_detection_dataset_dicts(
-    names,
-    filter_empty=True,
-    min_keypoints=0,
-    proposal_files=None,
-    check_consistency=True,
-):
-    """
-    Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.
-    Args:
-        names (str or list[str]): a dataset name or a list of dataset names
-        filter_empty (bool): whether to filter out images without instance annotations
-        min_keypoints (int): filter out images with fewer keypoints than
-            `min_keypoints`. Set to 0 to do nothing.
-        proposal_files (list[str]): if given, a list of object proposal files
-            that match each dataset in `names`.
-        check_consistency (bool): whether to check if datasets have consistent metadata.
-    Returns:
-        list[dict]: a list of dicts following the standard dataset dict format.
-    """
-    if isinstance(names, str):
-        names = [names]
-    assert len(names), names
-    dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names]
-    if isinstance(dataset_dicts[0], torchdata.Dataset):
-        if len(dataset_dicts) > 1:
-            # ConcatDataset does not work for iterable style dataset.
-            # We could support concat for iterable as well, but it's often
-            # not a good idea to concat iterables anyway.
-            return torchdata.ConcatDataset(dataset_dicts)
-        return dataset_dicts[0]
-    for dataset_name, dicts in zip(names, dataset_dicts):
-        assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
-    if proposal_files is not None:
-        assert len(names) == len(proposal_files)
-        # load precomputed proposals from proposal files
-        dataset_dicts = [
-            load_proposals_into_dataset(dataset_i_dicts, proposal_file)
-            for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files)
-        ]
-    dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))
-    has_instances = "annotations" in dataset_dicts[0]
-    if filter_empty and has_instances:
-        dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
-    if min_keypoints > 0 and has_instances:
-        dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints)
-    if check_consistency and has_instances:
-        try:
-            class_names = MetadataCatalog.get(names[0]).thing_classes
-            check_metadata_consistency("thing_classes", names)
-            print_instances_class_histogram(dataset_dicts, class_names)
-        except AttributeError:  # class names are not available for this dataset
-            pass
-    assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names))
-    return dataset_dicts
-def build_batch_data_loader(
-    dataset,
-    sampler,
-    total_batch_size,
-    *,
-    aspect_ratio_grouping=False,
-    num_workers=0,
-    collate_fn=None,
-):
-    """
-    Build a batched dataloader. The main differences from `torch.utils.data.DataLoader` are:
-    1. support aspect ratio grouping options
-    2. use no "batch collation", because this is common for detection training
-    Args:
-        dataset (torch.utils.data.Dataset): a pytorch map-style or iterable dataset.
-        sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces indices.
-            Must be provided iff. ``dataset`` is a map-style dataset.
-        total_batch_size, aspect_ratio_grouping, num_workers, collate_fn: see
-            :func:`build_detection_train_loader`.
-    Returns:
-        iterable[list]. Length of each list is the batch size of the current
-            GPU. Each element in the list comes from the dataset.
-    """
-    world_size = get_world_size()
-    assert (
-        total_batch_size > 0 and total_batch_size % world_size == 0
-    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
-        total_batch_size, world_size
-    )
-    batch_size = total_batch_size // world_size
-    if isinstance(dataset, torchdata.IterableDataset):
-        assert sampler is None, "sampler must be None if dataset is IterableDataset"
-    else:
-        dataset = ToIterableDataset(dataset, sampler)
-    if aspect_ratio_grouping:
-        data_loader = torchdata.DataLoader(
-            dataset,
-            num_workers=num_workers,
-            collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
-            worker_init_fn=worker_init_reset_seed,
-        )  # yield individual mapped dict
-        data_loader = AspectRatioGroupedDataset(data_loader, batch_size)
-        if collate_fn is None:
-            return data_loader
-        return MapDataset(data_loader, collate_fn)
-    else:
-        return torchdata.DataLoader(
-            dataset,
-            batch_size=batch_size,
-            drop_last=True,
-            num_workers=num_workers,
-            collate_fn=trivial_batch_collator if collate_fn is None else collate_fn,
-            worker_init_fn=worker_init_reset_seed,
-        )
-def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
-    if dataset is None:
-        dataset = get_detection_dataset_dicts(
-            cfg.DATASETS.TRAIN,
-            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
-            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
-            if cfg.MODEL.KEYPOINT_ON
-            else 0,
-            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
-        )
-        _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])
-    if mapper is None:
-        mapper = DatasetMapper(cfg, True)
-    if sampler is None:
-        sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
-        logger = logging.getLogger(__name__)
-        if isinstance(dataset, torchdata.IterableDataset):
-            logger.info("Not using any sampler since the dataset is IterableDataset.")
-            sampler = None
-        else:
-            logger.info("Using training sampler {}".format(sampler_name))
-            if sampler_name == "TrainingSampler":
-                sampler = TrainingSampler(len(dataset))
-            elif sampler_name == "RepeatFactorTrainingSampler":
-                repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
-                    dataset, cfg.DATALOADER.REPEAT_THRESHOLD
-                )
-                sampler = RepeatFactorTrainingSampler(repeat_factors)
-            elif sampler_name == "RandomSubsetTrainingSampler":
-                sampler = RandomSubsetTrainingSampler(
-                    len(dataset), cfg.DATALOADER.RANDOM_SUBSET_RATIO
-                )
-            else:
-                raise ValueError("Unknown training sampler: {}".format(sampler_name))
-    return {
-        "dataset": dataset,
-        "sampler": sampler,
-        "mapper": mapper,
-        "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
-        "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
-        "num_workers": cfg.DATALOADER.NUM_WORKERS,
-    }
-@configurable(from_config=_train_loader_from_config)
-def build_detection_train_loader(
-    dataset,
-    *,
-    mapper,
-    sampler=None,
-    total_batch_size,
-    aspect_ratio_grouping=True,
-    num_workers=0,
-    collate_fn=None,
-):
-    """
-    Build a dataloader for object detection with some default features.
-    Args:
-        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
-            or a pytorch dataset (either map-style or iterable). It can be obtained
-            by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
-        mapper (callable): a callable which takes a sample (dict) from dataset and
-            returns the format to be consumed by the model.
-            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
-        sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces
-            indices to be applied on ``dataset``.
-            If ``dataset`` is map-style, the default sampler is a :class:`TrainingSampler`,
-            which coordinates an infinite random shuffle sequence across all workers.
-            Sampler must be None if ``dataset`` is iterable.
-        total_batch_size (int): total batch size across all workers.
-        aspect_ratio_grouping (bool): whether to group images with similar
-            aspect ratio for efficiency. When enabled, it requires each
-            element in dataset be a dict with keys "width" and "height".
-        num_workers (int): number of parallel data loading workers
-        collate_fn: a function that determines how to do batching, same as the argument of
-            `torch.utils.data.DataLoader`. Defaults to do no collation and return a list of
-            data. No collation is OK for small batch size and simple data structures.
-            If your batch size is large and each sample contains too many small tensors,
-            it's more efficient to collate them in data loader.
-    Returns:
-        torch.utils.data.DataLoader:
-            a dataloader. Each output from it is a ``list[mapped_element]`` of length
-            ``total_batch_size / num_workers``, where ``mapped_element`` is produced
-            by the ``mapper``.
-    """
-    if isinstance(dataset, list):
-        dataset = DatasetFromList(dataset, copy=False)
-    if mapper is not None:
-        dataset = MapDataset(dataset, mapper)
-    if isinstance(dataset, torchdata.IterableDataset):
-        assert sampler is None, "sampler must be None if dataset is IterableDataset"
-    else:
-        if sampler is None:
-            sampler = TrainingSampler(len(dataset))
-        assert isinstance(sampler, torchdata.Sampler), f"Expect a Sampler but got {type(sampler)}"
-    return build_batch_data_loader(
-        dataset,
-        sampler,
-        total_batch_size,
-        aspect_ratio_grouping=aspect_ratio_grouping,
-        num_workers=num_workers,
-        collate_fn=collate_fn,
-    )
-def _test_loader_from_config(cfg, dataset_name, mapper=None):
-    """
-    Uses the given `dataset_name` argument (instead of the names in cfg), because the
-    standard practice is to evaluate each test set individually (not combining them).
-    """
-    if isinstance(dataset_name, str):
-        dataset_name = [dataset_name]
-    dataset = get_detection_dataset_dicts(
-        dataset_name,
-        filter_empty=False,
-        proposal_files=[
-            cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name
-        ]
-        if cfg.MODEL.LOAD_PROPOSALS
-        else None,
-    )
-    if mapper is None:
-        mapper = DatasetMapper(cfg, False)
-    return {
-        "dataset": dataset,
-        "mapper": mapper,
-        "num_workers": cfg.DATALOADER.NUM_WORKERS,
-        "sampler": InferenceSampler(len(dataset))
-        if not isinstance(dataset, torchdata.IterableDataset)
-        else None,
-    }
-@configurable(from_config=_test_loader_from_config)
-def build_detection_test_loader(
-    dataset: Union[List[Any], torchdata.Dataset],
-    *,
-    mapper: Callable[[Dict[str, Any]], Any],
-    sampler: Optional[torchdata.Sampler] = None,
-    batch_size: int = 1,
-    num_workers: int = 0,
-    collate_fn: Optional[Callable[[List[Any]], Any]] = None,
-) -> torchdata.DataLoader:
-    """
-    Similar to `build_detection_train_loader`, with default batch size = 1,
-    and sampler = :class:`InferenceSampler`. This sampler coordinates all workers
-    to produce the exact set of all samples.
-    Args:
-        dataset: a list of dataset dicts,
-            or a pytorch dataset (either map-style or iterable). They can be obtained
-            by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
-        mapper: a callable which takes a sample (dict) from dataset
-           and returns the format to be consumed by the model.
-           When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
-        sampler: a sampler that produces
-            indices to be applied on ``dataset``. Default to :class:`InferenceSampler`,
-            which splits the dataset across all workers. Sampler must be None
-            if `dataset` is iterable.
-        batch_size: the batch size of the data loader to be created.
-            Default to 1 image per worker since this is the standard when reporting
-            inference time in papers.
-        num_workers: number of parallel data loading workers
-        collate_fn: same as the argument of `torch.utils.data.DataLoader`.
-            Defaults to do no collation and return a list of data.
-    Returns:
-        DataLoader: a torch DataLoader, that loads the given detection
-        dataset, with test-time transformation and batching.
-    Examples:
-    ::
-        data_loader = build_detection_test_loader(
-            DatasetRegistry.get("my_test"),
-            mapper=DatasetMapper(...))
-        # or, instantiate with a CfgNode:
-        data_loader = build_detection_test_loader(cfg, "my_test")
-    """
-    if isinstance(dataset, list):
-        dataset = DatasetFromList(dataset, copy=False)
-    if mapper is not None:
-        dataset = MapDataset(dataset, mapper)
-    if isinstance(dataset, torchdata.IterableDataset):
-        assert sampler is None, "sampler must be None if dataset is IterableDataset"
-    else:
-        if sampler is None:
-            sampler = InferenceSampler(len(dataset))
-    return torchdata.DataLoader(
-        dataset,
-        batch_size=batch_size,
-        sampler=sampler,
-        drop_last=False,
-        num_workers=num_workers,
-        collate_fn=trivial_batch_collator if collate_fn is None else collate_fn,
-    )
-def trivial_batch_collator(batch):
-    """
-    A batch collator that does nothing.
-    """
-    return batch
-def worker_init_reset_seed(worker_id):
-    initial_seed = torch.initial_seed() % 2**31
-    seed_all_rng(initial_seed + worker_id)

cutler/data/dataset_mapper.py DELETED Viewed

@@ -1,193 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/dataset_mapper.py
-import copy
-import logging
-import numpy as np
-from typing import List, Optional, Union
-import torch
-from detectron2.config import configurable
-import data.detection_utils as utils
-import data.transforms as T
-"""
-This file contains the default mapping that's applied to "dataset dicts".
-"""
-__all__ = ["DatasetMapper"]
-class DatasetMapper:
-    """
-    A callable which takes a dataset dict in Detectron2 Dataset format,
-    and map it into a format used by the model.
-    This is the default callable to be used to map your dataset dict into training data.
-    You may need to follow it to implement your own one for customized logic,
-    such as a different way to read or transform images.
-    See :doc:`/tutorials/data_loading` for details.
-    The callable currently does the following:
-    1. Read the image from "file_name"
-    2. Applies cropping/geometric transforms to the image and annotations
-    3. Prepare data and annotations to Tensor and :class:`Instances`
-    """
-    @configurable
-    def __init__(
-        self,
-        is_train: bool,
-        *,
-        augmentations: List[Union[T.Augmentation, T.Transform]],
-        image_format: str,
-        use_instance_mask: bool = False,
-        use_keypoint: bool = False,
-        instance_mask_format: str = "polygon",
-        keypoint_hflip_indices: Optional[np.ndarray] = None,
-        precomputed_proposal_topk: Optional[int] = None,
-        recompute_boxes: bool = False,
-    ):
-        """
-        NOTE: this interface is experimental.
-        Args:
-            is_train: whether it's used in training or inference
-            augmentations: a list of augmentations or deterministic transforms to apply
-            image_format: an image format supported by :func:`detection_utils.read_image`.
-            use_instance_mask: whether to process instance segmentation annotations, if available
-            use_keypoint: whether to process keypoint annotations if available
-            instance_mask_format: one of "polygon" or "bitmask". Process instance segmentation
-                masks into this format.
-            keypoint_hflip_indices: see :func:`detection_utils.create_keypoint_hflip_indices`
-            precomputed_proposal_topk: if given, will load pre-computed
-                proposals from dataset_dict and keep the top k proposals for each image.
-            recompute_boxes: whether to overwrite bounding box annotations
-                by computing tight bounding boxes from instance mask annotations.
-        """
-        # Boxes can only be recomputed from masks, so masks must be enabled.
-        if recompute_boxes:
-            assert use_instance_mask, "recompute_boxes requires instance masks"
-        # fmt: off
-        self.is_train               = is_train
-        self.augmentations          = T.AugmentationList(augmentations)
-        self.image_format           = image_format
-        self.use_instance_mask      = use_instance_mask
-        self.instance_mask_format   = instance_mask_format
-        self.use_keypoint           = use_keypoint
-        self.keypoint_hflip_indices = keypoint_hflip_indices
-        self.proposal_topk          = precomputed_proposal_topk
-        self.recompute_boxes        = recompute_boxes
-        # fmt: on
-        logger = logging.getLogger(__name__)
-        mode = "training" if is_train else "inference"
-        logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}")
-    @classmethod
-    def from_config(cls, cfg, is_train: bool = True):
-        """
-        Build the keyword arguments of ``__init__`` from a config.
-        Args:
-            cfg: config object; the INPUT, MODEL and DATASETS entries read below are used.
-            is_train: whether the mapper is built for training or inference.
-        Returns:
-            dict: keyword arguments for ``__init__``.
-        """
-        augs = utils.build_augmentation(cfg, is_train)
-        # Random cropping is only added for training and is placed before all other augmentations.
-        if cfg.INPUT.CROP.ENABLED and is_train:
-            augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE))
-            recompute_boxes = cfg.MODEL.MASK_ON
-        else:
-            recompute_boxes = False
-        ret = {
-            "is_train": is_train,
-            "augmentations": augs,
-            "image_format": cfg.INPUT.FORMAT,
-            "use_instance_mask": cfg.MODEL.MASK_ON,
-            "instance_mask_format": cfg.INPUT.MASK_FORMAT,
-            "use_keypoint": cfg.MODEL.KEYPOINT_ON,
-            "recompute_boxes": recompute_boxes,
-        }
-        # Optional entries are only added when the corresponding feature is enabled.
-        if cfg.MODEL.KEYPOINT_ON:
-            ret["keypoint_hflip_indices"] = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
-        if cfg.MODEL.LOAD_PROPOSALS:
-            ret["precomputed_proposal_topk"] = (
-                cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN
-                if is_train
-                else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST
-            )
-        return ret
-    def _transform_annotations(self, dataset_dict, transforms, image_shape):
-        """
-        Replace ``dataset_dict["annotations"]`` by ``dataset_dict["instances"]``.
-        Args:
-            dataset_dict (dict): record of one image; modified in place.
-            transforms: the transforms that were applied to the image.
-            image_shape: (h, w) of the transformed image.
-        """
-        # USER: Modify this if you want to keep them for some reason.
-        for anno in dataset_dict["annotations"]:
-            if not self.use_instance_mask:
-                anno.pop("segmentation", None)
-            if not self.use_keypoint:
-                anno.pop("keypoints", None)
-        # USER: Implement additional transformations if you have other types of data
-        # Crowd annotations are skipped; the "annotations" key is removed from the record.
-        annos = [
-            utils.transform_instance_annotations(
-                obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
-            )
-            for obj in dataset_dict.pop("annotations")
-            if obj.get("iscrowd", 0) == 0
-        ]
-        instances = utils.annotations_to_instances(
-            annos, image_shape, mask_format=self.instance_mask_format
-        )
-        # After transforms such as cropping are applied, the bounding box may no longer
-        # tightly bound the object. As an example, imagine a triangle object
-        # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
-        # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
-        # the intersection of original bounding box and the cropping box.
-        if self.recompute_boxes:
-            instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
-        dataset_dict["instances"] = utils.filter_empty_instances(instances)
-    def __call__(self, dataset_dict):
-        """
-        Args:
-            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
-        Returns:
-            dict: a format that builtin models in detectron2 accept
-        """
-        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
-        # USER: Write your own image loading if it's not from a file
-        image = utils.read_image(dataset_dict["file_name"], format=self.image_format)
-        utils.check_image_size(dataset_dict, image)
-        # USER: Remove if you don't do semantic/panoptic segmentation.
-        if "sem_seg_file_name" in dataset_dict:
-            sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2)
-        else:
-            sem_seg_gt = None
-        # The augmentations update aug_input in place and return the applied transforms,
-        # which are reused below for the proposals and the annotations.
-        aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
-        transforms = self.augmentations(aug_input)
-        image, sem_seg_gt = aug_input.image, aug_input.sem_seg
-        image_shape = image.shape[:2]  # h, w
-        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
-        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
-        # Therefore it's important to use torch.Tensor.
-        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
-        if sem_seg_gt is not None:
-            dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long"))
-        # USER: Remove if you don't use pre-computed proposals.
-        # Most users would not need this feature.
-        if self.proposal_topk is not None:
-            utils.transform_proposals(
-                dataset_dict, image_shape, transforms, proposal_topk=self.proposal_topk
-            )
-        # At inference time the annotations are dropped and the record is returned as is.
-        if not self.is_train:
-            # USER: Modify this if you want to keep them for some reason.
-            dataset_dict.pop("annotations", None)
-            dataset_dict.pop("sem_seg_file_name", None)
-            return dataset_dict
-        if "annotations" in dataset_dict:
-            self._transform_annotations(dataset_dict, transforms, image_shape)
-        return dataset_dict

cutler/data/datasets/__init__.py DELETED Viewed

@@ -1,16 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-from .coco import load_coco_json, load_sem_seg, register_coco_instances, convert_to_coco_json
-from .builtin import (
-    register_all_imagenet,
-    register_all_uvo,
-    register_all_coco_ca,
-    register_all_coco_semi,
-    register_all_lvis,
-    register_all_voc,
-    register_all_cross_domain,
-    register_all_kitti,
-    register_all_objects365,
-    register_all_openimages,
-    )
-__all__ = [k for k in globals().keys() if not k.startswith("_")]

cutler/data/datasets/builtin.py DELETED Viewed

@@ -1,216 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/datasets/builtin.py
-"""
-This file registers pre-defined datasets at hard-coded paths, and their metadata.
-We hard-code metadata for common datasets. This will enable:
-1. Consistency check when loading the datasets
-2. Use models on these standard datasets directly and run demos,
-   without having to download the dataset annotations
-We hard-code some paths to the dataset that's assumed to
-exist in "./datasets/".
-Users SHOULD NOT use this file to create new datasets / metadata for new datasets.
-To add a new dataset, refer to the tutorial "docs/DATASETS.md".
-"""
-import os
-from .builtin_meta import _get_builtin_metadata
-from .coco import register_coco_instances
-# ==== Predefined datasets and splits for COCO ==========
-_PREDEFINED_SPLITS_COCO_SEMI = {}
-_PREDEFINED_SPLITS_COCO_SEMI["coco_semi"] = {
-    # we use seed 42 to be consistent with previous works on SSL detection and segmentation
-    "coco_semi_1perc": ("coco/train2017", "coco/annotations/1perc_instances_train2017.json"),
-    "coco_semi_2perc": ("coco/train2017", "coco/annotations/2perc_instances_train2017.json"),
-    "coco_semi_5perc": ("coco/train2017", "coco/annotations/5perc_instances_train2017.json"),
-    "coco_semi_10perc": ("coco/train2017", "coco/annotations/10perc_instances_train2017.json"),
-    "coco_semi_20perc": ("coco/train2017", "coco/annotations/20perc_instances_train2017.json"),
-    "coco_semi_30perc": ("coco/train2017", "coco/annotations/30perc_instances_train2017.json"),
-    "coco_semi_40perc": ("coco/train2017", "coco/annotations/40perc_instances_train2017.json"),
-    "coco_semi_50perc": ("coco/train2017", "coco/annotations/50perc_instances_train2017.json"),
-    "coco_semi_60perc": ("coco/train2017", "coco/annotations/60perc_instances_train2017.json"),
-    "coco_semi_80perc": ("coco/train2017", "coco/annotations/80perc_instances_train2017.json"),
-}
-_PREDEFINED_SPLITS_COCO_CA = {}
-_PREDEFINED_SPLITS_COCO_CA["coco_cls_agnostic"] = {
-    "cls_agnostic_coco": ("coco/val2017", "coco/annotations/coco_cls_agnostic_instances_val2017.json"),
-    "cls_agnostic_coco20k": ("coco/train2014", "coco/annotations/coco20k_trainval_gt.json"),
-}
-_PREDEFINED_SPLITS_IMAGENET = {}
-_PREDEFINED_SPLITS_IMAGENET["imagenet"] = {
-    # maskcut annotations
-    "imagenet_train": ("imagenet/train", "imagenet/annotations/imagenet_train_fixsize480_tau0.15_N3.json"),
-    # self-training round 1
-    "imagenet_train_r1": ("imagenet/train", "imagenet/annotations/cutler_imagenet1k_train_r1.json"),
-    # self-training round 2
-    "imagenet_train_r2": ("imagenet/train", "imagenet/annotations/cutler_imagenet1k_train_r2.json"),
-    # self-training round 3
-    "imagenet_train_r3": ("imagenet/train", "imagenet/annotations/cutler_imagenet1k_train_r3.json"),
-}
-_PREDEFINED_SPLITS_VOC = {}
-_PREDEFINED_SPLITS_VOC["voc"] = {
-    'cls_agnostic_voc': ("voc/", "voc/annotations/trainvaltest_2007_cls_agnostic.json"),
-}
-_PREDEFINED_SPLITS_CROSSDOMAIN = {}
-_PREDEFINED_SPLITS_CROSSDOMAIN["cross_domain"] = {
-    'cls_agnostic_clipart': ("clipart/", "clipart/annotations/traintest_cls_agnostic.json"),
-    'cls_agnostic_watercolor': ("watercolor/", "watercolor/annotations/traintest_cls_agnostic.json"),
-    'cls_agnostic_comic': ("comic/", "comic/annotations/traintest_cls_agnostic.json"),
-}
-_PREDEFINED_SPLITS_KITTI = {}
-_PREDEFINED_SPLITS_KITTI["kitti"] = {
-    'cls_agnostic_kitti': ("kitti/", "kitti/annotations/trainval_cls_agnostic.json"),
-}
-_PREDEFINED_SPLITS_LVIS = {}
-_PREDEFINED_SPLITS_LVIS["lvis"] = {
-    "cls_agnostic_lvis": ("coco/", "coco/annotations/lvis1.0_cocofied_val_cls_agnostic.json"),
-}
-_PREDEFINED_SPLITS_OBJECTS365 = {}
-_PREDEFINED_SPLITS_OBJECTS365["objects365"] = {
-    'cls_agnostic_objects365': ("objects365/val", "objects365/annotations/zhiyuan_objv2_val_cls_agnostic.json"),
-}
-_PREDEFINED_SPLITS_OpenImages = {}
-_PREDEFINED_SPLITS_OpenImages["openimages"] = {
-    'cls_agnostic_openimages': ("openImages/validation", "openImages/annotations/openimages_val_cls_agnostic.json"),
-}
-_PREDEFINED_SPLITS_UVO = {}
-_PREDEFINED_SPLITS_UVO["uvo"] = {
-    "cls_agnostic_uvo": ("uvo/all_UVO_frames", "uvo/annotations/val_sparse_cleaned_cls_agnostic.json"),
-}
-def register_all_imagenet(root):
-    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_IMAGENET.items():
-        for key, (image_root, json_file) in splits_per_dataset.items():
-            # Assume pre-defined datasets live in `./datasets`.
-            register_coco_instances(
-                key,
-                _get_builtin_metadata(dataset_name),
-                os.path.join(root, json_file) if "://" not in json_file else json_file,
-                os.path.join(root, image_root),
-            )
-def register_all_voc(root):
-    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_VOC.items():
-        for key, (image_root, json_file) in splits_per_dataset.items():
-            # Assume pre-defined datasets live in `./datasets`.
-            register_coco_instances(
-                key,
-                _get_builtin_metadata(dataset_name),
-                os.path.join(root, json_file) if "://" not in json_file else json_file,
-                os.path.join(root, image_root),
-            )
-def register_all_cross_domain(root):
-    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_CROSSDOMAIN.items():
-        for key, (image_root, json_file) in splits_per_dataset.items():
-            # Assume pre-defined datasets live in `./datasets`.
-            register_coco_instances(
-                key,
-                _get_builtin_metadata(dataset_name),
-                os.path.join(root, json_file) if "://" not in json_file else json_file,
-                os.path.join(root, image_root),
-            )
-def register_all_kitti(root):
-    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_KITTI.items():
-        for key, (image_root, json_file) in splits_per_dataset.items():
-            # Assume pre-defined datasets live in `./datasets`.
-            register_coco_instances(
-                key,
-                _get_builtin_metadata(dataset_name),
-                os.path.join(root, json_file) if "://" not in json_file else json_file,
-                os.path.join(root, image_root),
-            )
-def register_all_objects365(root):
-    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_OBJECTS365.items():
-        for key, (image_root, json_file) in splits_per_dataset.items():
-            # Assume pre-defined datasets live in `./datasets`.
-            register_coco_instances(
-                key,
-                _get_builtin_metadata(dataset_name),
-                os.path.join(root, json_file) if "://" not in json_file else json_file,
-                os.path.join(root, image_root),
-            )
-def register_all_openimages(root):
-    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_OpenImages.items():
-        for key, (image_root, json_file) in splits_per_dataset.items():
-            # Assume pre-defined datasets live in `./datasets`.
-            register_coco_instances(
-                key,
-                _get_builtin_metadata(dataset_name),
-                os.path.join(root, json_file) if "://" not in json_file else json_file,
-                os.path.join(root, image_root),
-            )
-def register_all_lvis(root):
-    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items():
-        for key, (image_root, json_file) in splits_per_dataset.items():
-            # Assume pre-defined datasets live in `./datasets`.
-            register_coco_instances(
-                key,
-                _get_builtin_metadata(dataset_name),
-                os.path.join(root, json_file) if "://" not in json_file else json_file,
-                os.path.join(root, image_root),
-            )
-def register_all_uvo(root):
-    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_UVO.items():
-        for key, (image_root, json_file) in splits_per_dataset.items():
-            # Assume pre-defined datasets live in `./datasets`.
-            register_coco_instances(
-                key,
-                _get_builtin_metadata(dataset_name),
-                os.path.join(root, json_file) if "://" not in json_file else json_file,
-                os.path.join(root, image_root),
-            )
-def register_all_coco_semi(root):
-    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_COCO_SEMI.items():
-        for key, (image_root, json_file) in splits_per_dataset.items():
-            # Assume pre-defined datasets live in `./datasets`.
-            register_coco_instances(
-                key,
-                _get_builtin_metadata(dataset_name),
-                os.path.join(root, json_file) if "://" not in json_file else json_file,
-                os.path.join(root, image_root),
-            )
-def register_all_coco_ca(root):
-    for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_COCO_CA.items():
-        for key, (image_root, json_file) in splits_per_dataset.items():
-            # Assume pre-defined datasets live in `./datasets`.
-            register_coco_instances(
-                key,
-                _get_builtin_metadata(dataset_name),
-                os.path.join(root, json_file) if "://" not in json_file else json_file,
-                os.path.join(root, image_root),
-            )
-_root = os.path.expanduser(os.getenv("DETECTRON2_DATASETS", "datasets"))
-register_all_coco_semi(_root)
-register_all_coco_ca(_root)
-register_all_imagenet(_root)
-register_all_uvo(_root)
-register_all_voc(_root)
-register_all_cross_domain(_root)
-register_all_kitti(_root)
-register_all_openimages(_root)
-register_all_objects365(_root)
-register_all_lvis(_root)

cutler/data/datasets/builtin_meta.py DELETED Viewed

@@ -1,389 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/datasets/builtin_meta.py
-"""
-Note:
-For your custom dataset, there is no need to hard-code metadata anywhere in the code.
-For example, for COCO-format dataset, metadata will be obtained automatically
-when calling `load_coco_json`. For other datasets, metadata may also be obtained in other ways
-during loading.
-However, we hard-coded metadata for a few common datasets here.
-The only goal is to allow users who don't have these datasets to use pre-trained models.
-Users don't have to download a COCO json (which contains metadata), in order to visualize a
-COCO model (with correct class names and colors).
-"""
-# All coco categories, together with their nice-looking visualization colors
-# It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json
-COCO_CATEGORIES = [
-    {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
-    {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"},
-    {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"},
-    {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"},
-    {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"},
-    {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"},
-    {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"},
-    {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"},
-    {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"},
-    {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"},
-    {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"},
-    {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"},
-    {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"},
-    {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"},
-    {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"},
-    {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"},
-    {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"},
-    {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
-    {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"},
-    {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"},
-    {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"},
-    {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"},
-    {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"},
-    {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"},
-    {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"},
-    {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"},
-    {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"},
-    {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"},
-    {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"},
-    {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"},
-    {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"},
-    {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"},
-    {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"},
-    {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"},
-    {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"},
-    {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"},
-    {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"},
-    {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"},
-    {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"},
-    {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"},
-    {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"},
-    {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"},
-    {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"},
-    {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"},
-    {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"},
-    {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"},
-    {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"},
-    {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"},
-    {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"},
-    {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"},
-    {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"},
-    {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"},
-    {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"},
-    {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"},
-    {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"},
-    {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"},
-    {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"},
-    {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"},
-    {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"},
-    {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"},
-    {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"},
-    {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"},
-    {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"},
-    {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"},
-    {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"},
-    {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"},
-    {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"},
-    {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"},
-    {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"},
-    {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"},
-    {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"},
-    {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"},
-    {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"},
-    {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"},
-    {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"},
-    {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"},
-    {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"},
-    {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"},
-    {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"},
-    {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"},
-    {"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"},
-    {"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"},
-    {"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"},
-    {"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"},
-    {"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"},
-    {"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"},
-    {"color": [92, 136, 89], "isthing": 0, "id": 112, "name": "door-stuff"},
-    {"color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"},
-    {"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"},
-    {"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"},
-    {"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"},
-    {"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"},
-    {"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"},
-    {"color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"},
-    {"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"},
-    {"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"},
-    {"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"},
-    {"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"},
-    {"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"},
-    {"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"},
-    {"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"},
-    {"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"},
-    {"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"},
-    {"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"},
-    {"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"},
-    {"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"},
-    {"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"},
-    {"color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"},
-    {"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"},
-    {"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"},
-    {"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"},
-    {"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"},
-    {"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"},
-    {"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"},
-    {"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"},
-    {"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"},
-    {"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"},
-    {"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"},
-    {"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"},
-    {"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"},
-    {"color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"},
-    {"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"},
-    {"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"},
-    {"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"},
-    {"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"},
-    {"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"},
-    {"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"},
-    {"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"},
-    {"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"},
-    {"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"},
-    {"color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"},
-    {"color": [102, 102, 156], "isthing": 0, "id": 199, "name": "wall-other-merged"},
-    {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"},
-]
-IMAGENET_CATEGORIES = [
-    {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "fg"},
-]
-UVO_CATEGORIES = [
-    {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "object"},
-]
-# fmt: off
-COCO_PERSON_KEYPOINT_NAMES = (
-    "nose",
-    "left_eye", "right_eye",
-    "left_ear", "right_ear",
-    "left_shoulder", "right_shoulder",
-    "left_elbow", "right_elbow",
-    "left_wrist", "right_wrist",
-    "left_hip", "right_hip",
-    "left_knee", "right_knee",
-    "left_ankle", "right_ankle",
-)
-# fmt: on
-# Pairs of keypoints that should be exchanged under horizontal flipping
-COCO_PERSON_KEYPOINT_FLIP_MAP = (
-    ("left_eye", "right_eye"),
-    ("left_ear", "right_ear"),
-    ("left_shoulder", "right_shoulder"),
-    ("left_elbow", "right_elbow"),
-    ("left_wrist", "right_wrist"),
-    ("left_hip", "right_hip"),
-    ("left_knee", "right_knee"),
-    ("left_ankle", "right_ankle"),
-)
-# rules for pairs of keypoints to draw a line between, and the line color to use.
-KEYPOINT_CONNECTION_RULES = [
-    # face
-    ("left_ear", "left_eye", (102, 204, 255)),
-    ("right_ear", "right_eye", (51, 153, 255)),
-    ("left_eye", "nose", (102, 0, 204)),
-    ("nose", "right_eye", (51, 102, 255)),
-    # upper-body
-    ("left_shoulder", "right_shoulder", (255, 128, 0)),
-    ("left_shoulder", "left_elbow", (153, 255, 204)),
-    ("right_shoulder", "right_elbow", (128, 229, 255)),
-    ("left_elbow", "left_wrist", (153, 255, 153)),
-    ("right_elbow", "right_wrist", (102, 255, 224)),
-    # lower-body
-    ("left_hip", "right_hip", (255, 102, 0)),
-    ("left_hip", "left_knee", (255, 255, 77)),
-    ("right_hip", "right_knee", (153, 255, 204)),
-    ("left_knee", "left_ankle", (191, 255, 128)),
-    ("right_knee", "right_ankle", (255, 195, 77)),
-]
-# All Cityscapes categories, together with their nice-looking visualization colors
-# It's from https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/helpers/labels.py  # noqa
-CITYSCAPES_CATEGORIES = [
-    {"color": (128, 64, 128), "isthing": 0, "id": 7, "trainId": 0, "name": "road"},
-    {"color": (244, 35, 232), "isthing": 0, "id": 8, "trainId": 1, "name": "sidewalk"},
-    {"color": (70, 70, 70), "isthing": 0, "id": 11, "trainId": 2, "name": "building"},
-    {"color": (102, 102, 156), "isthing": 0, "id": 12, "trainId": 3, "name": "wall"},
-    {"color": (190, 153, 153), "isthing": 0, "id": 13, "trainId": 4, "name": "fence"},
-    {"color": (153, 153, 153), "isthing": 0, "id": 17, "trainId": 5, "name": "pole"},
-    {"color": (250, 170, 30), "isthing": 0, "id": 19, "trainId": 6, "name": "traffic light"},
-    {"color": (220, 220, 0), "isthing": 0, "id": 20, "trainId": 7, "name": "traffic sign"},
-    {"color": (107, 142, 35), "isthing": 0, "id": 21, "trainId": 8, "name": "vegetation"},
-    {"color": (152, 251, 152), "isthing": 0, "id": 22, "trainId": 9, "name": "terrain"},
-    {"color": (70, 130, 180), "isthing": 0, "id": 23, "trainId": 10, "name": "sky"},
-    {"color": (220, 20, 60), "isthing": 1, "id": 24, "trainId": 11, "name": "person"},
-    {"color": (255, 0, 0), "isthing": 1, "id": 25, "trainId": 12, "name": "rider"},
-    {"color": (0, 0, 142), "isthing": 1, "id": 26, "trainId": 13, "name": "car"},
-    {"color": (0, 0, 70), "isthing": 1, "id": 27, "trainId": 14, "name": "truck"},
-    {"color": (0, 60, 100), "isthing": 1, "id": 28, "trainId": 15, "name": "bus"},
-    {"color": (0, 80, 100), "isthing": 1, "id": 31, "trainId": 16, "name": "train"},
-    {"color": (0, 0, 230), "isthing": 1, "id": 32, "trainId": 17, "name": "motorcycle"},
-    {"color": (119, 11, 32), "isthing": 1, "id": 33, "trainId": 18, "name": "bicycle"},
-]
-# fmt: off
-ADE20K_SEM_SEG_CATEGORIES = [
-    "wall", "building", "sky", "floor", "tree", "ceiling", "road, route", "bed", "window ", "grass", "cabinet", "sidewalk, pavement", "person", "earth, ground", "door", "table", "mountain, mount", "plant", "curtain", "chair", "car", "water", "painting, picture", "sofa", "shelf", "house", "sea", "mirror", "rug", "field", "armchair", "seat", "fence", "desk", "rock, stone", "wardrobe, closet, press", "lamp", "tub", "rail", "cushion", "base, pedestal, stand", "box", "column, pillar", "signboard, sign", "chest of drawers, chest, bureau, dresser", "counter", "sand", "sink", "skyscraper", "fireplace", "refrigerator, icebox", "grandstand, covered stand", "path", "stairs", "runway", "case, display case, showcase, vitrine", "pool table, billiard table, snooker table", "pillow", "screen door, screen", "stairway, staircase", "river", "bridge, span", "bookcase", "blind, screen", "coffee table", "toilet, can, commode, crapper, pot, potty, stool, throne", "flower", "book", "hill", "bench", "countertop", "stove", "palm, palm tree", "kitchen island", "computer", "swivel chair", "boat", "bar", "arcade machine", "hovel, hut, hutch, shack, shanty", "bus", "towel", "light", "truck", "tower", "chandelier", "awning, sunshade, sunblind", "street lamp", "booth", "tv", "plane", "dirt track", "clothes", "pole", "land, ground, soil", "bannister, banister, balustrade, balusters, handrail", "escalator, moving staircase, moving stairway", "ottoman, pouf, pouffe, puff, hassock", "bottle", "buffet, counter, sideboard", "poster, posting, placard, notice, bill, card", "stage", "van", "ship", "fountain", "conveyer belt, conveyor belt, conveyer, conveyor, transporter", "canopy", "washer, automatic washer, washing machine", "plaything, toy", "pool", "stool", "barrel, cask", "basket, handbasket", "falls", "tent", "bag", "minibike, motorbike", "cradle", "oven", "ball", "food, solid food", "step, stair", "tank, storage tank", "trade name", "microwave", "pot", "animal", "bicycle", "lake", "dishwasher", 
"screen", "blanket, cover", "sculpture", "hood, exhaust hood", "sconce", "vase", "traffic light", "tray", "trash can", "fan", "pier", "crt screen", "plate", "monitor", "bulletin board", "shower", "radiator", "glass, drinking glass", "clock", "flag", # noqa
-]
-# After processing by `prepare_ade20k_sem_seg.py`, id 255 means "ignore"
-# fmt: on
-def _get_coco_instances_meta():
-    thing_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 1]
-    thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1]
-    assert len(thing_ids) == 80, len(thing_ids)
-    # Mapping from the incontiguous COCO category id to an id in [0, 79]
-    thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
-    thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1]
-    ret = {
-        "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
-        "thing_classes": thing_classes,
-        "thing_colors": thing_colors,
-    }
-    return ret
-def _get_imagenet_instances_meta():
-    thing_ids = [k["id"] for k in IMAGENET_CATEGORIES if k["isthing"] == 1]
-    thing_colors = [k["color"] for k in IMAGENET_CATEGORIES if k["isthing"] == 1]
-    assert len(thing_ids) == 1, len(thing_ids)
-    thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
-    thing_classes = [k["name"] for k in IMAGENET_CATEGORIES if k["isthing"] == 1]
-    ret = {
-        "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
-        "thing_classes": thing_classes,
-        "thing_colors": thing_colors,
-        "class_image_count":  [{'id': 1, 'image_count': 116986}]
-    }
-    return ret
-def _get_UVO_instances_meta():
-    thing_ids = [k["id"] for k in UVO_CATEGORIES if k["isthing"] == 1]
-    thing_colors = [k["color"] for k in UVO_CATEGORIES if k["isthing"] == 1]
-    assert len(thing_ids) == 1, len(thing_ids)
-    thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
-    thing_classes = [k["name"] for k in UVO_CATEGORIES if k["isthing"] == 1]
-    ret = {
-        "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
-        "thing_classes": thing_classes,
-        "thing_colors": thing_colors,
-        "class_image_count":  [{'id': 1, 'image_count': 116986}]
-    }
-    return ret
-def _get_coco_panoptic_separated_meta():
-    """
-    Returns metadata for "separated" version of the panoptic segmentation dataset.
-    """
-    stuff_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 0]
-    assert len(stuff_ids) == 53, len(stuff_ids)
-    # For semantic segmentation, this mapping maps from contiguous stuff id
-    # (in [0, 53], used in models) to ids in the dataset (used for processing results)
-    # The id 0 is mapped to an extra category "thing".
-    stuff_dataset_id_to_contiguous_id = {k: i + 1 for i, k in enumerate(stuff_ids)}
-    # When converting COCO panoptic annotations to semantic annotations
-    # We label the "thing" category to 0
-    stuff_dataset_id_to_contiguous_id[0] = 0
-    # 54 names for COCO stuff categories (including "things")
-    stuff_classes = ["things"] + [
-        k["name"].replace("-other", "").replace("-merged", "")
-        for k in COCO_CATEGORIES
-        if k["isthing"] == 0
-    ]
-    # NOTE: I randomly picked a color for things
-    stuff_colors = [[82, 18, 128]] + [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 0]
-    ret = {
-        "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id,
-        "stuff_classes": stuff_classes,
-        "stuff_colors": stuff_colors,
-    }
-    ret.update(_get_coco_instances_meta())
-    return ret
-def _get_builtin_metadata(dataset_name):
-    if dataset_name in ["coco", "coco_semi"]:
-        return _get_coco_instances_meta()
-    if dataset_name == "coco_panoptic_separated":
-        return _get_coco_panoptic_separated_meta()
-    elif dataset_name in ["imagenet", "kitti", "cross_domain", "lvis", "voc", "coco_cls_agnostic", "objects365", 'openimages']:
-        return _get_imagenet_instances_meta()
-    elif dataset_name == "uvo":
-        return _get_UVO_instances_meta()
-    elif dataset_name == "coco_panoptic_standard":
-        meta = {}
-        # The following metadata maps contiguous id from [0, #thing categories +
-        # #stuff categories) to their names and colors. We have to replica of the
-        # same name and color under "thing_*" and "stuff_*" because the current
-        # visualization function in D2 handles thing and class classes differently
-        # due to some heuristic used in Panoptic FPN. We keep the same naming to
-        # enable reusing existing visualization functions.
-        thing_classes = [k["name"] for k in COCO_CATEGORIES]
-        thing_colors = [k["color"] for k in COCO_CATEGORIES]
-        stuff_classes = [k["name"] for k in COCO_CATEGORIES]
-        stuff_colors = [k["color"] for k in COCO_CATEGORIES]
-        meta["thing_classes"] = thing_classes
-        meta["thing_colors"] = thing_colors
-        meta["stuff_classes"] = stuff_classes
-        meta["stuff_colors"] = stuff_colors
-        # Convert category id for training:
-        #   category id: like semantic segmentation, it is the class id for each
-        #   pixel. Since there are some classes not used in evaluation, the category
-        #   id is not always contiguous and thus we have two set of category ids:
-        #       - original category id: category id in the original dataset, mainly
-        #           used for evaluation.
-        #       - contiguous category id: [0, #classes), in order to train the linear
-        #           softmax classifier.
-        thing_dataset_id_to_contiguous_id = {}
-        stuff_dataset_id_to_contiguous_id = {}
-        for i, cat in enumerate(COCO_CATEGORIES):
-            if cat["isthing"]:
-                thing_dataset_id_to_contiguous_id[cat["id"]] = i
-            else:
-                stuff_dataset_id_to_contiguous_id[cat["id"]] = i
-        meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
-        meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
-        return meta
-    elif dataset_name == "coco_person":
-        return {
-            "thing_classes": ["person"],
-            "keypoint_names": COCO_PERSON_KEYPOINT_NAMES,
-            "keypoint_flip_map": COCO_PERSON_KEYPOINT_FLIP_MAP,
-            "keypoint_connection_rules": KEYPOINT_CONNECTION_RULES,
-        }
-    elif dataset_name == "cityscapes":
-        # fmt: off
-        CITYSCAPES_THING_CLASSES = [
-            "person", "rider", "car", "truck",
-            "bus", "train", "motorcycle", "bicycle",
-        ]
-        CITYSCAPES_STUFF_CLASSES = [
-            "road", "sidewalk", "building", "wall", "fence", "pole", "traffic light",
-            "traffic sign", "vegetation", "terrain", "sky", "person", "rider", "car",
-            "truck", "bus", "train", "motorcycle", "bicycle",
-        ]
-        # fmt: on
-        return {
-            "thing_classes": CITYSCAPES_THING_CLASSES,
-            "stuff_classes": CITYSCAPES_STUFF_CLASSES,
-        }
-    raise KeyError("No built-in metadata for dataset {}".format(dataset_name))

cutler/data/datasets/coco.py DELETED Viewed

@@ -1,544 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/datasets/coco.py
-import contextlib
-import datetime
-import io
-import json
-import logging
-import numpy as np
-import os
-import shutil
-import pycocotools.mask as mask_util
-from fvcore.common.timer import Timer
-from iopath.common.file_io import file_lock
-from PIL import Image
-from detectron2.structures import Boxes, BoxMode, PolygonMasks, RotatedBoxes
-from detectron2.utils.file_io import PathManager
-from detectron2.data import DatasetCatalog, MetadataCatalog
-"""
-This file contains functions to parse COCO-format annotations into dicts in "Detectron2 format".
-"""
-logger = logging.getLogger(__name__)
-__all__ = ["load_coco_json", "load_sem_seg", "convert_to_coco_json", "register_coco_instances"]
-def load_coco_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None):
-    """
-    Load a json file with COCO's instances annotation format.
-    Currently supports instance detection, instance segmentation,
-    and person keypoints annotations.
-    Args:
-        json_file (str): full path to the json file in COCO instances annotation format.
-        image_root (str or path-like): the directory where the images in this json file exists.
-        dataset_name (str or None): the name of the dataset (e.g., coco_2017_train).
-            When provided, this function will also do the following:
-            * Put "thing_classes" into the metadata associated with this dataset.
-            * Map the category ids into a contiguous range (needed by standard dataset format),
-              and add "thing_dataset_id_to_contiguous_id" to the metadata associated
-              with this dataset.
-            This option should usually be provided, unless users need to load
-            the original json content and apply more processing manually.
-        extra_annotation_keys (list[str]): list of per-annotation keys that should also be
-            loaded into the dataset dict (besides "iscrowd", "bbox", "keypoints",
-            "category_id", "segmentation"). The values for these keys will be returned as-is.
-            For example, the densepose annotations are loaded in this way.
-    Returns:
-        list[dict]: a list of dicts in Detectron2 standard dataset dicts format (See
-        `Using Custom Datasets </tutorials/datasets.html>`_ ) when `dataset_name` is not None.
-        If `dataset_name` is None, the returned `category_ids` may be
-        incontiguous and may not conform to the Detectron2 standard format.
-    Notes:
-        1. This function does not read the image files.
-           The results do not have the "image" field.
-    """
-    from pycocotools.coco import COCO
-    timer = Timer()
-    json_file = PathManager.get_local_path(json_file)
-    with contextlib.redirect_stdout(io.StringIO()):
-        coco_api = COCO(json_file)
-    if timer.seconds() > 1:
-        logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
-    id_map = None
-    if dataset_name is not None:
-        meta = MetadataCatalog.get(dataset_name)
-        cat_ids = sorted(coco_api.getCatIds())
-        cats = coco_api.loadCats(cat_ids)
-        # The categories in a custom json file may not be sorted.
-        thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])]
-        if "imagenet" not in dataset_name and "cls_agnostic" not in dataset_name:
-            meta.thing_classes = thing_classes
-            # In COCO, certain category ids are artificially removed,
-            # and by convention they are always ignored.
-            # We deal with COCO's id issue and translate
-            # the category ids to contiguous ids in [0, 80).
-            # It works by looking at the "categories" field in the json, therefore
-            # if users' own json also have incontiguous ids, we'll
-            # apply this mapping as well but print a warning.
-            if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)):
-                if "coco" not in dataset_name:
-                    logger.warning(
-                        """
-    Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.
-    """
-                    )
-            id_map = {v: i for i, v in enumerate(cat_ids)}
-            meta.thing_dataset_id_to_contiguous_id = id_map
-        else:
-            id_map = meta.thing_dataset_id_to_contiguous_id
-    # sort indices for reproducible results
-    img_ids = sorted(coco_api.imgs.keys())
-    # imgs is a list of dicts, each looks something like:
-    # {'license': 4,
-    #  'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
-    #  'file_name': 'COCO_val2014_000000001268.jpg',
-    #  'height': 427,
-    #  'width': 640,
-    #  'date_captured': '2013-11-17 05:57:24',
-    #  'id': 1268}
-    imgs = coco_api.loadImgs(img_ids)
-    # anns is a list[list[dict]], where each dict is an annotation
-    # record for an object. The inner list enumerates the objects in an image
-    # and the outer list enumerates over images. Example of anns[0]:
-    # [{'segmentation': [[192.81,
-    #     247.09,
-    #     ...
-    #     219.03,
-    #     249.06]],
-    #   'area': 1035.749,
-    #   'iscrowd': 0,
-    #   'image_id': 1268,
-    #   'bbox': [192.81, 224.8, 74.73, 33.43],
-    #   'category_id': 16,
-    #   'id': 42986},
-    #  ...]
-    anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
-    total_num_valid_anns = sum([len(x) for x in anns])
-    total_num_anns = len(coco_api.anns)
-    if total_num_valid_anns < total_num_anns:
-        logger.warning(
-            f"{json_file} contains {total_num_anns} annotations, but only "
-            f"{total_num_valid_anns} of them match to images in the file."
-        )
-    if "minival" not in json_file:
-        # The popular valminusminival & minival annotations for COCO2014 contain this bug.
-        # However the ratio of buggy annotations there is tiny and does not affect accuracy.
-        # Therefore we explicitly white-list them.
-        ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
-        assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
-            json_file
-        )
-    imgs_anns = list(zip(imgs, anns))
-    logger.info("Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file))
-    dataset_dicts = []
-    ann_keys = ["iscrowd", "bbox", "keypoints", "category_id"] + (extra_annotation_keys or [])
-    num_instances_without_valid_segmentation = 0
-    for (img_dict, anno_dict_list) in imgs_anns:
-        record = {}
-        record["file_name"] = os.path.join(image_root, img_dict["file_name"])
-        record["height"] = img_dict["height"]
-        record["width"] = img_dict["width"]
-        image_id = record["image_id"] = img_dict["id"]
-        objs = []
-        for anno in anno_dict_list:
-            # Check that the image_id in this annotation is the same as
-            # the image_id we're looking at.
-            # This fails only when the data parsing logic or the annotation file is buggy.
-            # The original COCO valminusminival2014 & minival2014 annotation files
-            # actually contains bugs that, together with certain ways of using COCO API,
-            # can trigger this assertion.
-            assert anno["image_id"] == image_id
-            assert anno.get("ignore", 0) == 0, '"ignore" in COCO json file is not supported.'
-            obj = {key: anno[key] for key in ann_keys if key in anno}
-            if "bbox" in obj and len(obj["bbox"]) == 0:
-                raise ValueError(
-                    f"One annotation of image {image_id} contains empty 'bbox' value! "
-                    "This json does not have valid COCO format."
-                )
-            segm = anno.get("segmentation", None)
-            if segm:  # either list[list[float]] or dict(RLE)
-                if isinstance(segm, dict):
-                    if isinstance(segm["counts"], list):
-                        # convert to compressed RLE
-                        segm = mask_util.frPyObjects(segm, *segm["size"])
-                else:
-                    # filter out invalid polygons (< 3 points)
-                    segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
-                    if len(segm) == 0:
-                        num_instances_without_valid_segmentation += 1
-                        continue  # ignore this instance
-                obj["segmentation"] = segm
-            keypts = anno.get("keypoints", None)
-            if keypts:  # list[int]
-                for idx, v in enumerate(keypts):
-                    if idx % 3 != 2:
-                        # COCO's segmentation coordinates are floating points in [0, H or W],
-                        # but keypoint coordinates are integers in [0, H-1 or W-1]
-                        # Therefore we assume the coordinates are "pixel indices" and
-                        # add 0.5 to convert to floating point coordinates.
-                        keypts[idx] = v + 0.5
-                obj["keypoints"] = keypts
-            obj["bbox_mode"] = BoxMode.XYWH_ABS
-            if id_map:
-                annotation_category_id = obj["category_id"]
-                try:
-                    obj["category_id"] = id_map[annotation_category_id]
-                except KeyError as e:
-                    raise KeyError(
-                        f"Encountered category_id={annotation_category_id} "
-                        "but this id does not exist in 'categories' of the json file."
-                    ) from e
-            objs.append(obj)
-        record["annotations"] = objs
-        dataset_dicts.append(record)
-    if num_instances_without_valid_segmentation > 0:
-        logger.warning(
-            "Filtered out {} instances without valid segmentation. ".format(
-                num_instances_without_valid_segmentation
-            )
-            + "There might be issues in your dataset generation process.  Please "
-            "check https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html carefully"
-        )
-    return dataset_dicts
-def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"):
-    """
-    Load semantic segmentation datasets. All files under "gt_root" with "gt_ext" extension are
-    treated as ground truth annotations and all files under "image_root" with "image_ext" extension
-    as input images. Ground truth and input images are matched using file paths relative to
-    "gt_root" and "image_root" respectively without taking into account file extensions.
-    This works for COCO as well as some other datasets.
-    Args:
-        gt_root (str): full path to ground truth semantic segmentation files. Semantic segmentation
-            annotations are stored as images with integer values in pixels that represent
-            corresponding semantic labels.
-        image_root (str): the directory where the input images are.
-        gt_ext (str): file extension for ground truth annotations.
-        image_ext (str): file extension for input images.
-    Returns:
-        list[dict]:
-            a list of dicts in detectron2 standard format without instance-level
-            annotation.
-    Notes:
-        1. This function does not read the image and ground truth files.
-           The results do not have the "image" and "sem_seg" fields.
-    """
-    # We match input images with ground truth based on their relative filepaths (without file
-    # extensions) starting from 'image_root' and 'gt_root' respectively.
-    def file2id(folder_path, file_path):
-        # extract relative path starting from `folder_path`
-        image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path))
-        # remove file extension
-        image_id = os.path.splitext(image_id)[0]
-        return image_id
-    input_files = sorted(
-        (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)),
-        key=lambda file_path: file2id(image_root, file_path),
-    )
-    gt_files = sorted(
-        (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)),
-        key=lambda file_path: file2id(gt_root, file_path),
-    )
-    assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root)
-    # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images
-    if len(input_files) != len(gt_files):
-        logger.warn(
-            "Directory {} and {} has {} and {} files, respectively.".format(
-                image_root, gt_root, len(input_files), len(gt_files)
-            )
-        )
-        input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files]
-        gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files]
-        intersect = list(set(input_basenames) & set(gt_basenames))
-        # sort, otherwise each worker may obtain a list[dict] in different order
-        intersect = sorted(intersect)
-        logger.warn("Will use their intersection of {} files.".format(len(intersect)))
-        input_files = [os.path.join(image_root, f + image_ext) for f in intersect]
-        gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect]
-    logger.info(
-        "Loaded {} images with semantic segmentation from {}".format(len(input_files), image_root)
-    )
-    dataset_dicts = []
-    for (img_path, gt_path) in zip(input_files, gt_files):
-        record = {}
-        record["file_name"] = img_path
-        record["sem_seg_file_name"] = gt_path
-        dataset_dicts.append(record)
-    return dataset_dicts
-def convert_to_coco_dict(dataset_name):
-    """
-    Convert an instance detection/segmentation or keypoint detection dataset
-    in detectron2's standard format into COCO json format.
-    Generic dataset description can be found here:
-    https://detectron2.readthedocs.io/tutorials/datasets.html#register-a-dataset
-    COCO data format description can be found here:
-    http://cocodataset.org/#format-data
-    Args:
-        dataset_name (str):
-            name of the source dataset
-            Must be registered in DatastCatalog and in detectron2's standard format.
-            Must have corresponding metadata "thing_classes"
-    Returns:
-        coco_dict: serializable dict in COCO json format
-    """
-    dataset_dicts = DatasetCatalog.get(dataset_name)
-    metadata = MetadataCatalog.get(dataset_name)
-    # unmap the category mapping ids for COCO
-    if hasattr(metadata, "thing_dataset_id_to_contiguous_id"):
-        reverse_id_mapping = {v: k for k, v in metadata.thing_dataset_id_to_contiguous_id.items()}
-        reverse_id_mapper = lambda contiguous_id: reverse_id_mapping[contiguous_id]  # noqa
-    else:
-        reverse_id_mapper = lambda contiguous_id: contiguous_id  # noqa
-    categories = [
-        {"id": reverse_id_mapper(id), "name": name}
-        for id, name in enumerate(metadata.thing_classes)
-    ]
-    logger.info("Converting dataset dicts into COCO format")
-    coco_images = []
-    coco_annotations = []
-    for image_id, image_dict in enumerate(dataset_dicts):
-        coco_image = {
-            "id": image_dict.get("image_id", image_id),
-            "width": int(image_dict["width"]),
-            "height": int(image_dict["height"]),
-            "file_name": str(image_dict["file_name"]),
-        }
-        coco_images.append(coco_image)
-        anns_per_image = image_dict.get("annotations", [])
-        for annotation in anns_per_image:
-            # create a new dict with only COCO fields
-            coco_annotation = {}
-            # COCO requirement: XYWH box format for axis-align and XYWHA for rotated
-            bbox = annotation["bbox"]
-            if isinstance(bbox, np.ndarray):
-                if bbox.ndim != 1:
-                    raise ValueError(f"bbox has to be 1-dimensional. Got shape={bbox.shape}.")
-                bbox = bbox.tolist()
-            if len(bbox) not in [4, 5]:
-                raise ValueError(f"bbox has to has length 4 or 5. Got {bbox}.")
-            from_bbox_mode = annotation["bbox_mode"]
-            to_bbox_mode = BoxMode.XYWH_ABS if len(bbox) == 4 else BoxMode.XYWHA_ABS
-            bbox = BoxMode.convert(bbox, from_bbox_mode, to_bbox_mode)
-            # COCO requirement: instance area
-            if "segmentation" in annotation:
-                # Computing areas for instances by counting the pixels
-                segmentation = annotation["segmentation"]
-                # TODO: check segmentation type: RLE, BinaryMask or Polygon
-                if isinstance(segmentation, list):
-                    polygons = PolygonMasks([segmentation])
-                    area = polygons.area()[0].item()
-                elif isinstance(segmentation, dict):  # RLE
-                    area = mask_util.area(segmentation).item()
-                else:
-                    raise TypeError(f"Unknown segmentation type {type(segmentation)}!")
-            else:
-                # Computing areas using bounding boxes
-                if to_bbox_mode == BoxMode.XYWH_ABS:
-                    bbox_xy = BoxMode.convert(bbox, to_bbox_mode, BoxMode.XYXY_ABS)
-                    area = Boxes([bbox_xy]).area()[0].item()
-                else:
-                    area = RotatedBoxes([bbox]).area()[0].item()
-            if "keypoints" in annotation:
-                keypoints = annotation["keypoints"]  # list[int]
-                for idx, v in enumerate(keypoints):
-                    if idx % 3 != 2:
-                        # COCO's segmentation coordinates are floating points in [0, H or W],
-                        # but keypoint coordinates are integers in [0, H-1 or W-1]
-                        # For COCO format consistency we substract 0.5
-                        # https://github.com/facebookresearch/detectron2/pull/175#issuecomment-551202163
-                        keypoints[idx] = v - 0.5
-                if "num_keypoints" in annotation:
-                    num_keypoints = annotation["num_keypoints"]
-                else:
-                    num_keypoints = sum(kp > 0 for kp in keypoints[2::3])
-            # COCO requirement:
-            #   linking annotations to images
-            #   "id" field must start with 1
-            coco_annotation["id"] = len(coco_annotations) + 1
-            coco_annotation["image_id"] = coco_image["id"]
-            coco_annotation["bbox"] = [round(float(x), 3) for x in bbox]
-            coco_annotation["area"] = float(area)
-            coco_annotation["iscrowd"] = int(annotation.get("iscrowd", 0))
-            coco_annotation["category_id"] = int(reverse_id_mapper(annotation["category_id"]))
-            # Add optional fields
-            if "keypoints" in annotation:
-                coco_annotation["keypoints"] = keypoints
-                coco_annotation["num_keypoints"] = num_keypoints
-            if "segmentation" in annotation:
-                seg = coco_annotation["segmentation"] = annotation["segmentation"]
-                if isinstance(seg, dict):  # RLE
-                    counts = seg["counts"]
-                    if not isinstance(counts, str):
-                        # make it json-serializable
-                        seg["counts"] = counts.decode("ascii")
-            coco_annotations.append(coco_annotation)
-    logger.info(
-        "Conversion finished, "
-        f"#images: {len(coco_images)}, #annotations: {len(coco_annotations)}"
-    )
-    info = {
-        "date_created": str(datetime.datetime.now()),
-        "description": "Automatically generated COCO json file for Detectron2.",
-    }
-    coco_dict = {"info": info, "images": coco_images, "categories": categories, "licenses": None}
-    if len(coco_annotations) > 0:
-        coco_dict["annotations"] = coco_annotations
-    return coco_dict
-def convert_to_coco_json(dataset_name, output_file, allow_cached=True):
-    """
-    Converts dataset into COCO format and saves it to a json file.
-    dataset_name must be registered in DatasetCatalog and in detectron2's standard format.
-    Args:
-        dataset_name:
-            reference from the config file to the catalogs
-            must be registered in DatasetCatalog and in detectron2's standard format
-        output_file: path of json file that will be saved to
-        allow_cached: if json file is already present then skip conversion
-    """
-    # TODO: The dataset or the conversion script *may* change,
-    # a checksum would be useful for validating the cached data
-    PathManager.mkdirs(os.path.dirname(output_file))
-    with file_lock(output_file):
-        if PathManager.exists(output_file) and allow_cached:
-            logger.warning(
-                f"Using previously cached COCO format annotations at '{output_file}'. "
-                "You need to clear the cache file if your dataset has been modified."
-            )
-        else:
-            logger.info(f"Converting annotations of dataset '{dataset_name}' to COCO format ...)")
-            coco_dict = convert_to_coco_dict(dataset_name)
-            logger.info(f"Caching COCO format annotations at '{output_file}' ...")
-            tmp_file = output_file + ".tmp"
-            with PathManager.open(tmp_file, "w") as f:
-                json.dump(coco_dict, f)
-            shutil.move(tmp_file, output_file)
-def register_coco_instances(name, metadata, json_file, image_root):
-    """
-    Register a dataset in COCO's json annotation format for
-    instance detection, instance segmentation and keypoint detection.
-    (i.e., Type 1 and 2 in http://cocodataset.org/#format-data.
-    `instances*.json` and `person_keypoints*.json` in the dataset).
-    This is an example of how to register a new dataset.
-    You can do something similar to this function, to register new datasets.
-    Args:
-        name (str): the name that identifies a dataset, e.g. "coco_2014_train".
-        metadata (dict): extra metadata associated with this dataset.  You can
-            leave it as an empty dict.
-        json_file (str): path to the json instance annotation file.
-        image_root (str or path-like): directory which contains all the images.
-    """
-    assert isinstance(name, str), name
-    assert isinstance(json_file, (str, os.PathLike)), json_file
-    assert isinstance(image_root, (str, os.PathLike)), image_root
-    # 1. register a function which returns dicts
-    DatasetCatalog.register(name, lambda: load_coco_json(json_file, image_root, name))
-    # 2. Optionally, add metadata about this dataset,
-    # since they might be useful in evaluation, visualization or logging
-    MetadataCatalog.get(name).set(
-        json_file=json_file, image_root=image_root, evaluator_type="coco", **metadata
-    )
-if __name__ == "__main__":
-    """
-    Test the COCO json dataset loader.
-    Usage:
-        python -m detectron2.data.datasets.coco \
-            path/to/json path/to/image_root dataset_name
-        "dataset_name" can be "coco_2014_minival_100", or other
-        pre-registered ones
-    """
-    from detectron2.utils.logger import setup_logger
-    from detectron2.utils.visualizer import Visualizer
-    import detectron2.data.datasets  # noqa # add pre-defined metadata
-    import sys
-    logger = setup_logger(name=__name__)
-    assert sys.argv[3] in DatasetCatalog.list()
-    meta = MetadataCatalog.get(sys.argv[3])
-    dicts = load_coco_json(sys.argv[1], sys.argv[2], sys.argv[3])
-    logger.info("Done loading {} samples.".format(len(dicts)))
-    dirname = "coco-data-vis"
-    os.makedirs(dirname, exist_ok=True)
-    for d in dicts:
-        img = np.array(Image.open(d["file_name"]))
-        visualizer = Visualizer(img, metadata=meta)
-        vis = visualizer.draw_dataset_dict(d)
-        fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
-        vis.save(fpath)

cutler/data/detection_utils.py DELETED Viewed

@@ -1,650 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/detection_utils.py
-"""
-Common data processing utilities that are used in a
-typical object detection data pipeline.
-"""
-import logging
-import numpy as np
-from typing import List, Union
-import pycocotools.mask as mask_util
-import torch
-from PIL import Image
-from detectron2.structures import (
-    Boxes,
-    BoxMode,
-    BitMasks,
-    Instances,
-    Keypoints,
-    PolygonMasks,
-    RotatedBoxes,
-    polygons_to_bitmask,
-)
-from detectron2.utils.file_io import PathManager
-from data import transforms as T
-from detectron2.data.catalog import MetadataCatalog
-__all__ = [
-    "SizeMismatchError",
-    "convert_image_to_rgb",
-    "check_image_size",
-    "transform_proposals",
-    "transform_instance_annotations",
-    "annotations_to_instances",
-    "annotations_to_instances_rotated",
-    "build_augmentation",
-    "build_transform_gen",
-    "create_keypoint_hflip_indices",
-    "filter_empty_instances",
-    "read_image",
-]
-class SizeMismatchError(ValueError):
-    """
-    When loaded image has difference width/height compared with annotation.
-    """
-# https://en.wikipedia.org/wiki/YUV#SDTV_with_BT.601
-_M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]]
-_M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]]
-# https://www.exiv2.org/tags.html
-_EXIF_ORIENT = 274  # exif 'Orientation' tag
-def convert_PIL_to_numpy(image, format):
-    """
-    Convert PIL image to numpy array of target format.
-    Args:
-        image (PIL.Image): a PIL image
-        format (str): the format of output image
-    Returns:
-        (np.ndarray): also see `read_image`
-    """
-    if format is not None:
-        # PIL only supports RGB, so convert to RGB and flip channels over below
-        conversion_format = format
-        if format in ["BGR", "YUV-BT.601"]:
-            conversion_format = "RGB"
-        image = image.convert(conversion_format)
-    image = np.asarray(image)
-    # PIL squeezes out the channel dimension for "L", so make it HWC
-    if format == "L":
-        image = np.expand_dims(image, -1)
-    # handle formats not supported by PIL
-    elif format == "BGR":
-        # flip channels if needed
-        image = image[:, :, ::-1]
-    elif format == "YUV-BT.601":
-        image = image / 255.0
-        image = np.dot(image, np.array(_M_RGB2YUV).T)
-    return image
-def convert_image_to_rgb(image, format):
-    """
-    Convert an image from given format to RGB.
-    Args:
-        image (np.ndarray or Tensor): an HWC image
-        format (str): the format of input image, also see `read_image`
-    Returns:
-        (np.ndarray): (H,W,3) RGB image in 0-255 range, can be either float or uint8
-    """
-    if isinstance(image, torch.Tensor):
-        image = image.cpu().numpy()
-    if format == "BGR":
-        image = image[:, :, [2, 1, 0]]
-    elif format == "YUV-BT.601":
-        image = np.dot(image, np.array(_M_YUV2RGB).T)
-        image = image * 255.0
-    else:
-        if format == "L":
-            image = image[:, :, 0]
-        image = image.astype(np.uint8)
-        image = np.asarray(Image.fromarray(image, mode=format).convert("RGB"))
-    return image
-def _apply_exif_orientation(image):
-    """
-    Applies the exif orientation correctly.
-    This code exists per the bug:
-      https://github.com/python-pillow/Pillow/issues/3973
-    with the function `ImageOps.exif_transpose`. The Pillow source raises errors with
-    various methods, especially `tobytes`
-    Function based on:
-      https://github.com/wkentaro/labelme/blob/v4.5.4/labelme/utils/image.py#L59
-      https://github.com/python-pillow/Pillow/blob/7.1.2/src/PIL/ImageOps.py#L527
-    Args:
-        image (PIL.Image): a PIL image
-    Returns:
-        (PIL.Image): the PIL image with exif orientation applied, if applicable
-    """
-    if not hasattr(image, "getexif"):
-        return image
-    try:
-        exif = image.getexif()
-    except Exception:  # https://github.com/facebookresearch/detectron2/issues/1885
-        exif = None
-    if exif is None:
-        return image
-    orientation = exif.get(_EXIF_ORIENT)
-    method = {
-        2: Image.FLIP_LEFT_RIGHT,
-        3: Image.ROTATE_180,
-        4: Image.FLIP_TOP_BOTTOM,
-        5: Image.TRANSPOSE,
-        6: Image.ROTATE_270,
-        7: Image.TRANSVERSE,
-        8: Image.ROTATE_90,
-    }.get(orientation)
-    if method is not None:
-        return image.transpose(method)
-    return image
-def read_image(file_name, format=None):
-    """
-    Read an image into the given format.
-    Will apply rotation and flipping if the image has such exif information.
-    Args:
-        file_name (str): image file path
-        format (str): one of the supported image modes in PIL, or "BGR" or "YUV-BT.601".
-    Returns:
-        image (np.ndarray):
-            an HWC image in the given format, which is 0-255, uint8 for
-            supported image modes in PIL or "BGR"; float (0-1 for Y) for YUV-BT.601.
-    """
-    with PathManager.open(file_name, "rb") as f:
-        image = Image.open(f)
-        # work around this bug: https://github.com/python-pillow/Pillow/issues/3973
-        image = _apply_exif_orientation(image)
-        return convert_PIL_to_numpy(image, format)
-def check_image_size(dataset_dict, image):
-    """
-    Raise an error if the image does not match the size specified in the dict.
-    """
-    if "width" in dataset_dict or "height" in dataset_dict:
-        image_wh = (image.shape[1], image.shape[0])
-        expected_wh = (dataset_dict["width"], dataset_dict["height"])
-        if not image_wh == expected_wh:
-            expected_wh = (dataset_dict["height"], dataset_dict["width"])
-            dataset_dict["height"], dataset_dict["width"] = dataset_dict["width"], dataset_dict["height"]
-            if image_wh != expected_wh:
-                raise SizeMismatchError(
-                    "Mismatched image shape{}, got {}, expect {}.".format(
-                        " for image " + dataset_dict["file_name"]
-                        if "file_name" in dataset_dict
-                        else "",
-                        image_wh,
-                        expected_wh,
-                    )
-                    + " Please check the width/height in your annotation."
-                )
-    # To ensure bbox always remap to original image size
-    if "width" not in dataset_dict:
-        dataset_dict["width"] = image.shape[1]
-    if "height" not in dataset_dict:
-        dataset_dict["height"] = image.shape[0]
-def transform_proposals(dataset_dict, image_shape, transforms, *, proposal_topk, min_box_size=0):
-    """
-    Move precomputed proposals in ``dataset_dict`` into the transformed image frame.
-    Nothing happens when the dict has no "proposal_boxes" entry.
-    Args:
-        dataset_dict (dict): a dict read from the dataset; may carry the fields
-            "proposal_boxes", "proposal_objectness_logits" and "proposal_bbox_mode"
-        image_shape (tuple): height, width
-        transforms (TransformList): its `apply_box` is applied to the proposal boxes
-        proposal_topk (int): number of leading proposals kept once filtering is done
-        min_box_size (int): threshold handed to `Boxes.nonempty`; boxes failing it are dropped
-    The dict is edited in-place: the three proposal fields are popped and a
-    "proposals" key is added. It holds an `Instances` object with the fields
-    "proposal_boxes" and "objectness_logits".
-    """
-    if "proposal_boxes" not in dataset_dict:
-        return
-    # Pop in the same order the fields are consumed: boxes, box mode, then logits.
-    raw_boxes = dataset_dict.pop("proposal_boxes")
-    raw_mode = dataset_dict.pop("proposal_bbox_mode")
-    xyxy = BoxMode.convert(raw_boxes, raw_mode, BoxMode.XYXY_ABS)
-    boxes = Boxes(transforms.apply_box(xyxy))
-    raw_logits = dataset_dict.pop("proposal_objectness_logits")
-    objectness_logits = torch.as_tensor(raw_logits.astype("float32"))
-    # Clip to the image first, then discard whatever became too small.
-    boxes.clip(image_shape)
-    keep = boxes.nonempty(threshold=min_box_size)
-    kept_boxes = boxes[keep]
-    kept_logits = objectness_logits[keep]
-    proposals = Instances(image_shape)
-    proposals.proposal_boxes = kept_boxes[:proposal_topk]
-    proposals.objectness_logits = kept_logits[:proposal_topk]
-    dataset_dict["proposals"] = proposals
-def transform_instance_annotations(
-    annotation, transforms, image_size, *, keypoint_hflip_indices=None
-):
-    """
-    Transform the box, segmentation and keypoints of one instance annotation.
-    The box goes through `transforms.apply_box`, polygons through
-    `transforms.apply_polygons`, RLE masks through `transforms.apply_segmentation`
-    and keypoints through `transform_keypoint_annotations`.
-    Write your own variant of this function (or of the transforms) if a data
-    structure needs special treatment.
-    Args:
-        annotation (dict): annotations of a single instance. Modified in-place.
-        transforms (TransformList or list[Transform]): a plain list/tuple is wrapped
-            in a `TransformList` first.
-        image_size (tuple): the height, width of the transformed image
-        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
-    Returns:
-        dict:
-            the input dict itself, with "bbox", "segmentation" and "keypoints"
-            transformed according to `transforms` and "bbox_mode" set to XYXY_ABS.
-    """
-    if isinstance(transforms, (tuple, list)):
-        transforms = T.TransformList(transforms)
-    # The per-instance box is 1d; wrap it in a batch of one for apply_box.
-    xyxy = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
-    moved = transforms.apply_box(np.array([xyxy]))[0].clip(min=0)
-    # image_size is (h, w); doubling and reversing yields the (w, h, w, h) upper bound.
-    upper_bound = list(image_size + image_size)[::-1]
-    annotation["bbox"] = np.minimum(moved, upper_bound)
-    annotation["bbox_mode"] = BoxMode.XYXY_ABS
-    if "segmentation" in annotation:
-        segm = annotation["segmentation"]
-        if isinstance(segm, list):
-            # One or more polygons, each handled as an (N, 2) point array.
-            point_sets = [np.asarray(poly).reshape(-1, 2) for poly in segm]
-            moved_sets = transforms.apply_polygons(point_sets)
-            annotation["segmentation"] = [poly.reshape(-1) for poly in moved_sets]
-        elif isinstance(segm, dict):
-            # COCO RLE: decode to a bitmap and transform that.
-            bitmap = transforms.apply_segmentation(mask_util.decode(segm))
-            assert tuple(bitmap.shape[:2]) == image_size
-            annotation["segmentation"] = bitmap
-        else:
-            raise ValueError(
-                "Cannot transform segmentation of type '{}'!"
-                "Supported types are: polygons as list[list[float] or ndarray],"
-                " COCO-style RLE as a dict.".format(type(segm))
-            )
-    if "keypoints" in annotation:
-        annotation["keypoints"] = transform_keypoint_annotations(
-            annotation["keypoints"], transforms, image_size, keypoint_hflip_indices
-        )
-    return annotation
-def transform_keypoint_annotations(keypoints, transforms, image_size, keypoint_hflip_indices=None):
-    """
-    Apply `transforms` to the keypoints of one instance.
-    Keypoints that land outside the image are marked "unlabeled" (visibility=0).
-    Args:
-        keypoints (list[float]): Nx3 float in Detectron2's Dataset format.
-            Each point is (x, y, visibility).
-        transforms (TransformList): provides `apply_coords` and the `transforms` list
-        image_size (tuple): the height, width of the transformed image
-        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
-            Needed when `transforms` holds an odd number of horizontal flips;
-            the mapping is then used to reorder the keypoints.
-    Returns:
-        ndarray: (N, 3) float64 array of transformed keypoints.
-    """
-    # (N*3,) -> (N, 3)
-    kpts = np.asarray(keypoints, dtype="float64").reshape(-1, 3)
-    xy = transforms.apply_coords(kpts[:, :2])
-    # A point is in bounds when 0 <= x <= width and 0 <= y <= height.
-    lower = np.array([0, 0])
-    upper = np.array(image_size[::-1])
-    in_bounds = ((xy >= lower) & (xy <= upper)).all(axis=1)
-    kpts[:, :2] = xy
-    kpts[:, 2][~in_bounds] = 0
-    # Only HFlipTransform is counted as a flip; an even count cancels out.
-    num_hflips = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms)
-    if num_hflips % 2 == 1:
-        # Swap every keypoint with its opposite-handed counterpart.
-        if keypoint_hflip_indices is None:
-            raise ValueError("Cannot flip keypoints without providing flip indices!")
-        if len(kpts) != len(keypoint_hflip_indices):
-            raise ValueError(
-                "Keypoint data has {} points, but metadata "
-                "contains {} points!".format(len(kpts), len(keypoint_hflip_indices))
-            )
-        kpts = kpts[np.asarray(keypoint_hflip_indices, dtype=np.int32), :]
-    # COCO convention: an unlabeled keypoint (visibility == 0) has x = y = 0.
-    kpts[kpts[:, 2] == 0] = 0
-    return kpts
-def annotations_to_instances(annos, image_size, mask_format="polygon"):
-    """
-    Build the :class:`Instances` object consumed by the models from the
-    instance annotations of one image.
-    Args:
-        annos (list[dict]): instance annotations of one image, one dict per instance.
-        image_size (tuple): height, width
-        mask_format (str): "polygon" wraps the segmentations in `PolygonMasks`;
-            "bitmask" converts each one to a bitmap and wraps them in `BitMasks`.
-    Returns:
-        Instances:
-            It carries "gt_boxes" and "gt_classes", plus "gt_masks" and
-            "gt_keypoints" when the first annotation has the matching field.
-            This is the format that builtin models expect.
-    """
-    if len(annos):
-        boxes = np.stack(
-            [BoxMode.convert(a["bbox"], a["bbox_mode"], BoxMode.XYXY_ABS) for a in annos]
-        )
-    else:
-        boxes = np.zeros((0, 4))
-    target = Instances(image_size)
-    target.gt_boxes = Boxes(boxes)
-    category_ids = [int(a["category_id"]) for a in annos]
-    target.gt_classes = torch.tensor(category_ids, dtype=torch.int64)
-    # Only the first annotation is inspected to decide whether masks are present.
-    if len(annos) and "segmentation" in annos[0]:
-        segms = [a["segmentation"] for a in annos]
-        if mask_format == "polygon":
-            try:
-                masks = PolygonMasks(segms)
-            except ValueError as e:
-                raise ValueError(
-                    "Failed to use mask_format=='polygon' from the given annotations!"
-                ) from e
-        else:
-            assert mask_format == "bitmask", mask_format
-            bitmaps = []
-            for segm in segms:
-                if isinstance(segm, list):
-                    # polygon -> rasterize at the image size
-                    bitmap = polygons_to_bitmask(segm, *image_size)
-                elif isinstance(segm, dict):
-                    # COCO RLE
-                    bitmap = mask_util.decode(segm)
-                elif isinstance(segm, np.ndarray):
-                    assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
-                        segm.ndim
-                    )
-                    # already a mask array
-                    bitmap = segm
-                else:
-                    raise ValueError(
-                        "Cannot convert segmentation of type '{}' to BitMasks!"
-                        "Supported types are: polygons as list[list[float] or ndarray],"
-                        " COCO-style RLE as a dict, or a binary segmentation mask "
-                        " in a 2D numpy array of shape HxW.".format(type(segm))
-                    )
-                bitmaps.append(bitmap)
-            # torch.from_numpy rejects negative strides, hence ascontiguousarray.
-            tensors = [torch.from_numpy(np.ascontiguousarray(b)) for b in bitmaps]
-            masks = BitMasks(torch.stack(tensors))
-        target.gt_masks = masks
-    # Same first-annotation rule for keypoints.
-    if len(annos) and "keypoints" in annos[0]:
-        target.gt_keypoints = Keypoints([a.get("keypoints", []) for a in annos])
-    return target
-def annotations_to_instances_rotated(annos, image_size):
-    """
-    Rotated-box counterpart of `annotations_to_instances`: build the
-    :class:`Instances` object consumed by the models from the instance
-    annotations of one image, handling rotated boxes only.
-    Args:
-        annos (list[dict]): instance annotations of one image, one dict per instance.
-        image_size (tuple): height, width
-    Returns:
-        Instances:
-            Carries the fields "gt_boxes" (`RotatedBoxes`, clipped to the image)
-            and "gt_classes". This is the format that builtin models expect.
-    """
-    raw_boxes = [a["bbox"] for a in annos]
-    target = Instances(image_size)
-    rotated = RotatedBoxes(raw_boxes)
-    target.gt_boxes = rotated
-    # clip works on the same object that was just stored on the target
-    rotated.clip(image_size)
-    category_ids = [a["category_id"] for a in annos]
-    target.gt_classes = torch.tensor(category_ids, dtype=torch.int64)
-    return target
-def filter_empty_instances(
-    instances, by_box=True, by_mask=True, box_threshold=1e-5, return_mask=False
-):
-    """
-    Drop the empty instances of an `Instances` object.
-    Args:
-        instances (Instances):
-        by_box (bool): drop instances whose box is empty
-        by_mask (bool): drop instances whose mask is empty (only when "gt_masks" exists)
-        box_threshold (float): minimum width and height for a box to count as non-empty
-        return_mask (bool): also return the boolean keep-mask
-    Returns:
-        Instances: the filtered instances. When no criterion applies, the input
-            object is returned unchanged (and alone, even with `return_mask=True`).
-        tensor[bool], optional: boolean mask of the instances that were kept
-    """
-    assert by_box or by_mask
-    criteria = []
-    if by_box:
-        criteria.append(instances.gt_boxes.nonempty(threshold=box_threshold))
-    if instances.has("gt_masks") and by_mask:
-        criteria.append(instances.gt_masks.nonempty())
-    # TODO: can also filter visible keypoints
-    if not criteria:
-        return instances
-    # An instance survives only if every active criterion accepts it.
-    keep = criteria[0]
-    for extra in criteria[1:]:
-        keep = keep & extra
-    selected = instances[keep]
-    return (selected, keep) if return_mask else selected
-def create_keypoint_hflip_indices(dataset_names: Union[str, List[str]]) -> List[int]:
-    """
-    Compute, for every keypoint, the index of its horizontally-flipped partner.
-    Args:
-        dataset_names: one dataset name or a list of them; their "keypoint_names"
-            and "keypoint_flip_map" metadata must agree.
-    Returns:
-        list[int]: a list of size=#keypoints, storing the
-        horizontally-flipped keypoint indices.
-    """
-    if isinstance(dataset_names, str):
-        dataset_names = [dataset_names]
-    for key in ("keypoint_names", "keypoint_flip_map"):
-        check_metadata_consistency(key, dataset_names)
-    meta = MetadataCatalog.get(dataset_names[0])
-    names = meta.keypoint_names
-    # TODO flip -> hflip
-    # Make the flip map symmetric so a lookup works from either side of a pair.
-    partner = dict(meta.keypoint_flip_map)
-    mirrored = {right: left for left, right in partner.items()}
-    partner.update(mirrored)
-    # Keypoints without a partner map to themselves.
-    flipped_names = [partner.get(name, name) for name in names]
-    return [names.index(name) for name in flipped_names]
-def get_fed_loss_cls_weights(dataset_names: Union[str, List[str]], freq_weight_power=1.0):
-    """
-    Return the frequency weight of every class, ordered by class id.
-    The weight is the class's image_count raised to the power freq_weight_power.
-    Args:
-        dataset_names: one dataset name or a list of them; their
-            "class_image_count" metadata must agree.
-        freq_weight_power: power value
-    Returns:
-        Tensor: float tensor with one weight per class.
-    """
-    if isinstance(dataset_names, str):
-        dataset_names = [dataset_names]
-    check_metadata_consistency("class_image_count", dataset_names)
-    meta = MetadataCatalog.get(dataset_names[0])
-    by_class_id = sorted(meta.class_image_count, key=lambda entry: entry["id"])
-    image_counts = torch.tensor([entry["image_count"] for entry in by_class_id])
-    return image_counts.float() ** freq_weight_power
-def gen_crop_transform_with_instance(crop_size, image_size, instance):
-    """
-    Sample a CropTransform whose window contains the center of the given instance.
-    Args:
-        crop_size (tuple): h, w in pixels
-        image_size (tuple): h, w
-        instance (dict): an annotation dict of one instance, in Detectron2's
-            dataset format.
-    Returns:
-        CropTransform: a crop of size `crop_size` with a randomly drawn top-left corner.
-    """
-    crop_size = np.asarray(crop_size, dtype=np.int32)
-    bbox = BoxMode.convert(instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS)
-    center_y = (bbox[1] + bbox[3]) * 0.5
-    center_x = (bbox[0] + bbox[2]) * 0.5
-    center_yx = center_y, center_x
-    assert (
-        image_size[0] >= center_yx[0] and image_size[1] >= center_yx[1]
-    ), "The annotation bounding box is outside of the image!"
-    assert (
-        image_size[0] >= crop_size[0] and image_size[1] >= crop_size[1]
-    ), "Crop size is larger than image size!"
-    # Smallest corner that still covers the center, floored at the image origin.
-    corner_lo = np.maximum(np.floor(center_yx).astype(np.int32) - crop_size, 0)
-    # Largest corner: the window must fit in the image and start at or before the center.
-    corner_fit = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_size, 0)
-    corner_hi = np.minimum(corner_fit, np.ceil(center_yx).astype(np.int32))
-    # Draw y before x so the random stream is consumed in the same order.
-    y0 = np.random.randint(corner_lo[0], corner_hi[0] + 1)
-    x0 = np.random.randint(corner_lo[1], corner_hi[1] + 1)
-    return T.CropTransform(x0, y0, crop_size[1], crop_size[0])
-def check_metadata_consistency(key, dataset_names):
-    """
-    Verify that all given datasets share the same value for one metadata key.
-    Returns immediately when no dataset is given.
-    Args:
-        key (str): a metadata key
-        dataset_names (list[str]): a list of dataset names
-    Raises:
-        AttributeError: if the key does not exist in the metadata
-        ValueError: if the given datasets do not have the same metadata values defined by key
-    """
-    if len(dataset_names) == 0:
-        return
-    logger = logging.getLogger(__name__)
-    entries = [getattr(MetadataCatalog.get(name), key) for name in dataset_names]
-    reference_name, reference = dataset_names[0], entries[0]
-    for name, entry in zip(dataset_names, entries):
-        if entry != reference:
-            # Log the offending value and the reference value before failing.
-            logger.error(
-                "Metadata '{}' for dataset '{}' is '{}'".format(key, name, str(entry))
-            )
-            logger.error(
-                "Metadata '{}' for dataset '{}' is '{}'".format(
-                    key, reference_name, str(reference)
-                )
-            )
-            raise ValueError("Datasets have different metadata '{}'!".format(key))
-def build_augmentation(cfg, is_train):
-    """
-    Build the default list of :class:`Augmentation` from a config:
-    a shortest-edge resize, followed by a random flip during training
-    unless `cfg.INPUT.RANDOM_FLIP` is "none".
-    Args:
-        cfg: config providing the `INPUT.*` size and flip options
-        is_train (bool): selects the train or the test size options
-    Returns:
-        list[Augmentation]
-    """
-    if is_train:
-        min_size, max_size, sample_style = (
-            cfg.INPUT.MIN_SIZE_TRAIN,
-            cfg.INPUT.MAX_SIZE_TRAIN,
-            cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
-        )
-    else:
-        # The test-time sampling style is always "choice".
-        min_size, max_size, sample_style = (
-            cfg.INPUT.MIN_SIZE_TEST,
-            cfg.INPUT.MAX_SIZE_TEST,
-            "choice",
-        )
-    augmentation = [T.ResizeShortestEdge(min_size, max_size, sample_style)]
-    if is_train and cfg.INPUT.RANDOM_FLIP != "none":
-        flip = T.RandomFlip(
-            horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
-            vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
-        )
-        augmentation.append(flip)
-    return augmentation
-# Second name bound to the same function object as `build_augmentation`.
-build_transform_gen = build_augmentation
-"""
-Alias for backward-compatibility.
-"""

cutler/data/transforms/__init__.py DELETED Viewed

@@ -1,15 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/transforms/__init__.py
-from fvcore.transforms.transform import *
-from .transform import *
-from detectron2.data.transforms.augmentation import *
-from .augmentation_impl import *
-__all__ = [k for k in globals().keys() if not k.startswith("_")]
-from detectron2.utils.env import fixup_module_metadata
-fixup_module_metadata(__name__, globals(), __all__)
-del fixup_module_metadata

cutler/data/transforms/augmentation_impl.py DELETED Viewed

@@ -1,616 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/transforms/augmentation_impl.py
-"""
-Implement many useful :class:`Augmentation`.
-"""
-import numpy as np
-import sys
-from typing import Tuple
-import torch
-from fvcore.transforms.transform import (
-    BlendTransform,
-    CropTransform,
-    HFlipTransform,
-    NoOpTransform,
-    PadTransform,
-    Transform,
-    TransformList,
-    VFlipTransform,
-)
-from PIL import Image
-from detectron2.data.transforms.augmentation import Augmentation, _transform_to_aug
-from .transform import ExtentTransform, ResizeTransform, RotationTransform
-__all__ = [
-    "FixedSizeCrop",
-    "RandomApply",
-    "RandomBrightness",
-    "RandomContrast",
-    "RandomCrop",
-    "RandomExtent",
-    "RandomFlip",
-    "RandomSaturation",
-    "RandomLighting",
-    "RandomRotation",
-    "Resize",
-    "ResizeScale",
-    "ResizeShortestEdge",
-    "RandomCrop_CategoryAreaConstraint",
-]
-class RandomApply(Augmentation):
-    """
-    Wrap an augmentation so that it is applied only with a given probability.
-    """
-    def __init__(self, tfm_or_aug, prob=0.5):
-        """
-        Args:
-            tfm_or_aug (Transform, Augmentation): what to apply; either a
-                `Transform` or an `Augmentation` instance.
-            prob (float): probability, between 0.0 and 1.0, that the wrapped
-                transformation is applied
-        """
-        super().__init__()
-        self.aug = _transform_to_aug(tfm_or_aug)
-        assert 0.0 <= prob <= 1.0, f"Probablity must be between 0.0 and 1.0 (given: {prob})"
-        self.prob = prob
-    def get_transform(self, *args):
-        # One random draw decides between the wrapped augmentation and a no-op.
-        if self._rand_range() < self.prob:
-            return self.aug.get_transform(*args)
-        return NoOpTransform()
-    def __call__(self, aug_input):
-        # Same decision rule as get_transform, applied to the AugInput path.
-        if self._rand_range() < self.prob:
-            return self.aug(aug_input)
-        return NoOpTransform()
-class RandomFlip(Augmentation):
-    """
-    Flip the image along one axis (horizontal or vertical) with a given probability.
-    """
-    def __init__(self, prob=0.5, *, horizontal=True, vertical=False):
-        """
-        Args:
-            prob (float): probability of flip.
-            horizontal (boolean): whether to apply horizontal flipping
-            vertical (boolean): whether to apply vertical flipping
-        Exactly one of `horizontal` and `vertical` must be True.
-        """
-        super().__init__()
-        if horizontal and vertical:
-            raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.")
-        if not horizontal and not vertical:
-            raise ValueError("At least one of horiz or vert has to be True!")
-        # _init stores the arguments as attributes; keep the locals of this method as they are.
-        self._init(locals())
-    def get_transform(self, image):
-        h, w = image.shape[:2]
-        flip = self._rand_range() < self.prob
-        if not flip:
-            return NoOpTransform()
-        if self.horizontal:
-            return HFlipTransform(w)
-        elif self.vertical:
-            return VFlipTransform(h)
-class Resize(Augmentation):
-    """Resize every image to one fixed (h, w) size."""
-    def __init__(self, shape, interp=Image.BILINEAR):
-        """
-        Args:
-            shape: (h, w) tuple, or a single int used for both sides
-            interp: PIL interpolation method
-        """
-        shape = (shape, shape) if isinstance(shape, int) else tuple(shape)
-        # _init stores `shape` and `interp` as attributes.
-        self._init(locals())
-    def get_transform(self, image):
-        old_h, old_w = image.shape[0], image.shape[1]
-        new_h, new_w = self.shape[0], self.shape[1]
-        return ResizeTransform(old_h, old_w, new_h, new_w, self.interp)
-class ResizeShortestEdge(Augmentation):
-    """
-    Aspect-ratio-preserving resize.
-    The shorter edge is scaled to the sampled `short_edge_length`. If that makes the
-    longer edge exceed `max_size`, the image is scaled down so that the longer edge
-    equals `max_size` instead.
-    """
-    @torch.jit.unused
-    def __init__(
-        self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR
-    ):
-        """
-        Args:
-            short_edge_length (list[int]): If ``sample_style=="range"``,
-                a [min, max] interval from which to sample the shortest edge length.
-                If ``sample_style=="choice"``, a list of shortest edge lengths to sample from.
-                A single int is treated as the pair (value, value).
-            max_size (int): maximum allowed longest edge length.
-            sample_style (str): either "range" or "choice".
-            interp: PIL interpolation method handed to `ResizeTransform`.
-        """
-        super().__init__()
-        assert sample_style in ["range", "choice"], sample_style
-        self.is_range = sample_style == "range"
-        if isinstance(short_edge_length, int):
-            short_edge_length = (short_edge_length, short_edge_length)
-        if self.is_range:
-            assert len(short_edge_length) == 2, (
-                "short_edge_length must be two values using 'range' sample style."
-                f" Got {short_edge_length}!"
-            )
-        # _init stores the arguments as attributes; keep the locals of this method as they are.
-        self._init(locals())
-    @torch.jit.unused
-    def get_transform(self, image):
-        h, w = image.shape[:2]
-        if self.is_range:
-            lo, hi = self.short_edge_length[0], self.short_edge_length[1]
-            size = np.random.randint(lo, hi + 1)
-        else:
-            size = np.random.choice(self.short_edge_length)
-        # A sampled size of 0 means "leave the image untouched".
-        if size == 0:
-            return NoOpTransform()
-        newh, neww = ResizeShortestEdge.get_output_shape(h, w, size, self.max_size)
-        return ResizeTransform(h, w, newh, neww, self.interp)
-    @staticmethod
-    def get_output_shape(
-        oldh: int, oldw: int, short_edge_length: int, max_size: int
-    ) -> Tuple[int, int]:
-        """
-        Compute the (height, width) that results from scaling the shorter edge
-        to `short_edge_length` while keeping the longer edge within `max_size`.
-        """
-        short = short_edge_length * 1.0
-        ratio = short / min(oldh, oldw)
-        if oldh < oldw:
-            out_h, out_w = short, ratio * oldw
-        else:
-            out_h, out_w = ratio * oldh, short
-        longest = max(out_h, out_w)
-        if longest > max_size:
-            shrink = max_size * 1.0 / longest
-            out_h = out_h * shrink
-            out_w = out_w * shrink
-        # Adding 0.5 before int() rounds to the nearest integer.
-        out_w = int(out_w + 0.5)
-        out_h = int(out_h + 0.5)
-        return (out_h, out_w)
-class ResizeScale(Augmentation):
-    """
-    Scale the target size by a random factor drawn from [`min_scale`, `max_scale`],
-    then resize the input image so that it fits inside the scaled target box
-    while keeping its aspect ratio.
-    This implements the resize part of the Google's 'resize_and_crop' data augmentation:
-    https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/input_utils.py#L127
-    """
-    def __init__(
-        self,
-        min_scale: float,
-        max_scale: float,
-        target_height: int,
-        target_width: int,
-        interp: int = Image.BILINEAR,
-    ):
-        """
-        Args:
-            min_scale: minimum image scale range.
-            max_scale: maximum image scale range.
-            target_height: target image height.
-            target_width: target image width.
-            interp: image interpolation method.
-        """
-        super().__init__()
-        # _init stores the arguments as attributes; keep the locals of this method as they are.
-        self._init(locals())
-    def _get_resize(self, image: np.ndarray, scale: float) -> Transform:
-        in_h, in_w = image.shape[:2]
-        # Target box after applying the sampled scale.
-        scaled_target = np.multiply((self.target_height, self.target_width), scale)
-        # The smaller of the two per-axis ratios makes the image fit in the box.
-        fit_ratio = np.minimum(scaled_target[0] / in_h, scaled_target[1] / in_w)
-        out_size = np.round(np.multiply((in_h, in_w), fit_ratio)).astype(int)
-        return ResizeTransform(in_h, in_w, out_size[0], out_size[1], self.interp)
-    def get_transform(self, image: np.ndarray) -> Transform:
-        return self._get_resize(image, np.random.uniform(self.min_scale, self.max_scale))
-class RandomRotation(Augmentation):
-    """
-    Rotate the image by a randomly sampled angle, counter clockwise,
-    around a (possibly randomly sampled) center.
-    """
-    def __init__(self, angle, expand=True, center=None, sample_style="range", interp=None):
-        """
-        Args:
-            angle (list[float]): If ``sample_style=="range"``,
-                a [min, max] interval from which to sample the angle (in degrees).
-                If ``sample_style=="choice"``, a list of angles to sample from.
-                A single number is treated as the pair (value, value).
-            expand (bool): choose if the image should be resized to fit the whole
-                rotated image (default), or simply cropped
-            center (list[[float, float]]):  If ``sample_style=="range"``,
-                a [[minx, miny], [maxx, maxy]] relative interval from which to sample the center,
-                [0, 0] being the top left of the image and [1, 1] the bottom right.
-                If ``sample_style=="choice"``, a list of centers to sample from.
-                A single [x, y] pair is treated as (pair, pair).
-                Default: None, which means that the center of rotation is the center of the image
-                center has no effect if expand=True because it only affects shifting
-            sample_style (str): either "range" or "choice".
-            interp: interpolation setting forwarded unchanged to `RotationTransform`.
-        """
-        super().__init__()
-        assert sample_style in ["range", "choice"], sample_style
-        self.is_range = sample_style == "range"
-        if isinstance(angle, (float, int)):
-            angle = (angle, angle)
-        if center is not None and isinstance(center[0], (float, int)):
-            center = (center, center)
-        # _init stores the arguments as attributes; keep the locals of this method as they are.
-        self._init(locals())
-    def get_transform(self, image):
-        h, w = image.shape[:2]
-        center = None
-        if self.is_range:
-            angle = np.random.uniform(self.angle[0], self.angle[1])
-            if self.center is not None:
-                center = (
-                    np.random.uniform(self.center[0][0], self.center[1][0]),
-                    np.random.uniform(self.center[0][1], self.center[1][1]),
-                )
-        else:
-            angle = np.random.choice(self.angle)
-            if self.center is not None:
-                # self.center is a list of [x, y] pairs. np.random.choice only takes
-                # 1-D input and raises ValueError on it, so draw an index instead.
-                center = self.center[np.random.randint(len(self.center))]
-        if center is not None:
-            center = (w * center[0], h * center[1])  # Convert to absolute coordinates
-        # A rotation by a multiple of 360 degrees leaves the image unchanged.
-        if angle % 360 == 0:
-            return NoOpTransform()
-        return RotationTransform(h, w, angle, expand=self.expand, center=center, interp=self.interp)
-class FixedSizeCrop(Augmentation):
-    """
-    Bring an image to `crop_size`. An image larger than `crop_size` gets a random
-    crop of that size. An image smaller than `crop_size` is padded on the right
-    and the bottom up to `crop_size` when `pad` is True, and is returned at its
-    smaller size otherwise.
-    """
-    def __init__(self, crop_size: Tuple[int], pad: bool = True, pad_value: float = 128.0):
-        """
-        Args:
-            crop_size: target image (height, width).
-            pad: if True, will pad images smaller than `crop_size` up to `crop_size`
-            pad_value: the padding value.
-        """
-        super().__init__()
-        # _init stores the arguments as attributes; keep the locals of this method as they are.
-        self._init(locals())
-    def _get_crop(self, image: np.ndarray) -> Transform:
-        in_size = image.shape[:2]
-        out_size = self.crop_size
-        # Room left for moving the window; zero on an axis where the image is not larger.
-        slack = np.maximum(np.subtract(in_size, out_size), 0)
-        # One uniform draw positions the window on both axes.
-        offset = np.round(np.multiply(slack, np.random.uniform(0.0, 1.0))).astype(int)
-        return CropTransform(
-            offset[1], offset[0], out_size[1], out_size[0], in_size[1], in_size[0]
-        )
-    def _get_pad(self, image: np.ndarray) -> Transform:
-        in_size = image.shape[:2]
-        out_size = self.crop_size
-        # Padding needed per axis; zero on an axis where the image is already large enough.
-        missing = np.maximum(np.subtract(out_size, in_size), 0)
-        # Size of the image after the crop step: the smaller of input and crop size per axis.
-        cropped = np.minimum(in_size, out_size)
-        return PadTransform(
-            0, 0, missing[1], missing[0], cropped[1], cropped[0], self.pad_value
-        )
-    def get_transform(self, image: np.ndarray) -> TransformList:
-        steps = [self._get_crop(image)]
-        if self.pad:
-            steps.append(self._get_pad(image))
-        return TransformList(steps)
-class RandomCrop(Augmentation):
-    """
-    Cut a randomly placed rectangle out of an image.
-    """
-    def __init__(self, crop_type: str, crop_size):
-        """
-        Args:
-            crop_type (str): one of "relative_range", "relative", "absolute", "absolute_range".
-            crop_size (tuple[float, float]): two floats, explained below.
-        - "relative": crop a (H * crop_size[0], W * crop_size[1]) region from an input image of
-          size (H, W). crop size should be in (0, 1]
-        - "relative_range": uniformly sample two values from [crop_size[0], 1]
-          and [crop_size[1], 1], and use them as in "relative" crop type.
-        - "absolute" crop a (crop_size[0], crop_size[1]) region from input image.
-          A side larger than the image is reduced to the image side.
-        - "absolute_range", for an input of size (H, W), uniformly sample H_crop in
-          [crop_size[0], min(H, crop_size[1])] and W_crop in [crop_size[0], min(W, crop_size[1])].
-          Then crop a region (H_crop, W_crop).
-        """
-        # TODO style of relative_range and absolute_range are not consistent:
-        # one takes (h, w) but another takes (min, max)
-        super().__init__()
-        assert crop_type in ["relative_range", "relative", "absolute", "absolute_range"]
-        # _init stores the arguments as attributes; keep the locals of this method as they are.
-        self._init(locals())
-    def get_transform(self, image):
-        h, w = image.shape[:2]
-        croph, cropw = self.get_crop_size((h, w))
-        assert h >= croph and w >= cropw, "Shape computation in {} has bugs.".format(self)
-        # Draw the top offset before the left offset.
-        top = np.random.randint(h - croph + 1)
-        left = np.random.randint(w - cropw + 1)
-        return CropTransform(left, top, cropw, croph)
-    def get_crop_size(self, image_size):
-        """
-        Args:
-            image_size (tuple): height, width
-        Returns:
-            crop_size (tuple): height, width in absolute pixels
-        """
-        h, w = image_size
-        kind = self.crop_type
-        if kind == "relative":
-            frac_h, frac_w = self.crop_size
-            return int(h * frac_h + 0.5), int(w * frac_w + 0.5)
-        if kind == "relative_range":
-            # Sample each fraction uniformly between its lower bound and 1.
-            lower = np.asarray(self.crop_size, dtype=np.float32)
-            frac_h, frac_w = lower + np.random.rand(2) * (1 - lower)
-            return int(h * frac_h + 0.5), int(w * frac_w + 0.5)
-        if kind == "absolute":
-            return (min(self.crop_size[0], h), min(self.crop_size[1], w))
-        if kind == "absolute_range":
-            assert self.crop_size[0] <= self.crop_size[1]
-            # Both bounds are capped at the image side before sampling.
-            ch = np.random.randint(min(h, self.crop_size[0]), min(h, self.crop_size[1]) + 1)
-            cw = np.random.randint(min(w, self.crop_size[0]), min(w, self.crop_size[1]) + 1)
-            return ch, cw
-        raise NotImplementedError("Unknown crop type {}".format(self.crop_type))
-class RandomCrop_CategoryAreaConstraint(Augmentation):
-    """
-    Similar to :class:`RandomCrop`, but find a cropping window such that no single category
-    occupies a ratio of more than `single_category_max_area` in semantic segmentation ground
-    truth, which can cause instability in training. The function attempts to find such a valid
-    cropping window for at most 10 times; if none of the attempts satisfies the constraint,
-    the last sampled window is used.
-    """
-    def __init__(
-        self,
-        crop_type: str,
-        crop_size,
-        single_category_max_area: float = 1.0,
-        ignored_category: int = None,
-    ):
-        """
-        Args:
-            crop_type, crop_size: same as in :class:`RandomCrop`
-            single_category_max_area: the maximum allowed area ratio of a
-                category. Set to 1.0 to disable
-            ignored_category: allow this category in the semantic segmentation
-                ground truth to exceed the area ratio. Usually set to the category
-                that's ignored in training.
-        """
-        # Initialize the base class like every other augmentation in this module does.
-        super().__init__()
-        self.crop_aug = RandomCrop(crop_type, crop_size)
-        self._init(locals())
-    def get_transform(self, image, sem_seg):
-        """
-        Args:
-            image (ndarray): the input image; only used when the constraint is disabled.
-            sem_seg (ndarray): 2D semantic segmentation ground truth.
-        Returns:
-            CropTransform: the selected cropping window.
-        """
-        if self.single_category_max_area >= 1.0:
-            # Constraint disabled: behave exactly like the wrapped RandomCrop.
-            return self.crop_aug.get_transform(image)
-        h, w = sem_seg.shape
-        for _ in range(10):
-            crop_size = self.crop_aug.get_crop_size((h, w))
-            y0 = np.random.randint(h - crop_size[0] + 1)
-            x0 = np.random.randint(w - crop_size[1] + 1)
-            sem_seg_temp = sem_seg[y0 : y0 + crop_size[0], x0 : x0 + crop_size[1]]
-            labels, cnt = np.unique(sem_seg_temp, return_counts=True)
-            if self.ignored_category is not None:
-                cnt = cnt[labels != self.ignored_category]
-            # Accept the window once at least two categories remain and the
-            # largest one stays below the allowed share of the counted pixels.
-            if len(cnt) > 1 and np.max(cnt) < np.sum(cnt) * self.single_category_max_area:
-                break
-        # Either the accepted window or, after 10 failed attempts, the last one sampled.
-        return CropTransform(x0, y0, crop_size[1], crop_size[0])
-class RandomExtent(Augmentation):
-    """
-    Crop a randomly scaled and shifted "subrect" of the source image.
-    The subrect may reach beyond the source image; such pixels are set to
-    zeros (i.e. black). The output image size follows the size of the
-    sampled subrect, so it varies from call to call.
-    """
-    def __init__(self, scale_range, shift_range):
-        """
-        Args:
-            scale_range (l, h): Range of input-to-output size scaling factor
-            shift_range (x, y): Range of shifts of the cropped subrect. The rect
-                is shifted by [w / 2 * Uniform(-x, x), h / 2 * Uniform(-y, y)],
-                where (w, h) is the (width, height) of the input image. Set each
-                component to zero to crop at the image's center.
-        """
-        super().__init__()
-        self._init(locals())
-    def get_transform(self, image):
-        height, width = image.shape[:2]
-        half_w = 0.5 * width
-        half_h = 0.5 * height
-        # Rect covering the whole image, expressed around the image center: (x0, y0, x1, y1).
-        rect = np.array([-half_w, -half_h, half_w, half_h])
-        # Random scaling about the center.
-        scale_lo, scale_hi = self.scale_range[0], self.scale_range[1]
-        rect *= np.random.uniform(scale_lo, scale_hi)
-        # Random shift: x coordinates first, then y coordinates.
-        rect[0::2] += self.shift_range[0] * width * (np.random.rand() - 0.5)
-        rect[1::2] += self.shift_range[1] * height * (np.random.rand() - 0.5)
-        # Move the origin from the image center to the image corner.
-        rect[0::2] += half_w
-        rect[1::2] += half_h
-        x0, y0, x1, y1 = rect
-        out_h = int(y1 - y0)
-        out_w = int(x1 - x0)
-        return ExtentTransform(src_rect=(x0, y0, x1, y1), output_size=(out_h, out_w))
-class RandomContrast(Augmentation):
-    """
-    Randomly change the contrast of an image.
-    The contrast intensity is drawn uniformly from (intensity_min, intensity_max):
-    - intensity < 1 will reduce contrast
-    - intensity = 1 will preserve the input image
-    - intensity > 1 will increase contrast
-    See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
-    """
-    def __init__(self, intensity_min, intensity_max):
-        """
-        Args:
-            intensity_min (float): Minimum augmentation
-            intensity_max (float): Maximum augmentation
-        """
-        super().__init__()
-        self._init(locals())
-    def get_transform(self, image):
-        intensity = np.random.uniform(self.intensity_min, self.intensity_max)
-        # Blend the image with its own mean value.
-        mean_value = image.mean()
-        return BlendTransform(
-            src_image=mean_value, src_weight=1 - intensity, dst_weight=intensity
-        )
-class RandomBrightness(Augmentation):
-    """
-    Randomly change the brightness of an image.
-    The brightness intensity is drawn uniformly from (intensity_min, intensity_max):
-    - intensity < 1 will reduce brightness
-    - intensity = 1 will preserve the input image
-    - intensity > 1 will increase brightness
-    See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
-    """
-    def __init__(self, intensity_min, intensity_max):
-        """
-        Args:
-            intensity_min (float): Minimum augmentation
-            intensity_max (float): Maximum augmentation
-        """
-        super().__init__()
-        self._init(locals())
-    def get_transform(self, image):
-        intensity = np.random.uniform(self.intensity_min, self.intensity_max)
-        # Blend the image with a constant 0 source.
-        return BlendTransform(
-            src_image=0, src_weight=1 - intensity, dst_weight=intensity
-        )
-class RandomSaturation(Augmentation):
-    """
-    Randomly change the saturation of an RGB image.
-    Input images are assumed to have 'RGB' channel order.
-    The saturation intensity is drawn uniformly from (intensity_min, intensity_max):
-    - intensity < 1 will reduce saturation (make the image more grayscale)
-    - intensity = 1 will preserve the input image
-    - intensity > 1 will increase saturation
-    See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
-    """
-    def __init__(self, intensity_min, intensity_max):
-        """
-        Args:
-            intensity_min (float): Minimum augmentation (1 preserves input).
-            intensity_max (float): Maximum augmentation (1 preserves input).
-        """
-        super().__init__()
-        self._init(locals())
-    def get_transform(self, image):
-        num_channels = image.shape[-1]
-        assert num_channels == 3, "RandomSaturation only works on RGB images"
-        intensity = np.random.uniform(self.intensity_min, self.intensity_max)
-        # Weighted channel sum gives a single-channel grayscale image; keep a trailing channel axis.
-        luma_weights = [0.299, 0.587, 0.114]
-        grayscale = image.dot(luma_weights)[:, :, np.newaxis]
-        return BlendTransform(
-            src_image=grayscale, src_weight=1 - intensity, dst_weight=intensity
-        )
-class RandomLighting(Augmentation):
-    """
-    The "lighting" augmentation described in AlexNet, using fixed PCA over ImageNet.
-    Input images are assumed to have 'RGB' channel order.
-    The per-component jitter weights are drawn from a normal distribution whose
-    standard deviation is the ``scale`` parameter.
-    """
-    def __init__(self, scale):
-        """
-        Args:
-            scale (float): Standard deviation of principal component weighting.
-        """
-        super().__init__()
-        self._init(locals())
-        # Fixed 3x3 eigenvector matrix and matching eigenvalues.
-        self.eigen_vecs = np.array(
-            [
-                [-0.5675, 0.7192, 0.4009],
-                [-0.5808, -0.0045, -0.8140],
-                [-0.5836, -0.6948, 0.4203],
-            ]
-        )
-        self.eigen_vals = np.array([0.2175, 0.0188, 0.0045])
-    def get_transform(self, image):
-        num_channels = image.shape[-1]
-        assert num_channels == 3, "RandomLighting only works on RGB images"
-        weights = np.random.normal(scale=self.scale, size=3)
-        # Per-channel offset that is added to the image with unit weights.
-        channel_offset = self.eigen_vecs.dot(weights * self.eigen_vals)
-        return BlendTransform(src_image=channel_offset, src_weight=1.0, dst_weight=1.0)

cutler/data/transforms/transform.py DELETED Viewed

@@ -1,355 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/transforms/transform.py
-"""
-See "Data Augmentation" tutorial for an overview of the system:
-https://detectron2.readthedocs.io/tutorials/augmentation.html
-"""
-import numpy as np
-import torch
-import torch.nn.functional as F
-from fvcore.transforms.transform import (
-    CropTransform,
-    HFlipTransform,
-    NoOpTransform,
-    Transform,
-    TransformList,
-)
-from PIL import Image
-try:
-    import cv2  # noqa
-except ImportError:
-    # OpenCV is an optional dependency at the moment
-    pass
-__all__ = [
-    "ExtentTransform",
-    "ResizeTransform",
-    "RotationTransform",
-    "ColorTransform",
-    "PILColorTransform",
-]
-class ExtentTransform(Transform):
-    """
-    Extracts a subregion from the source image and scales it to the output size.
-    The fill color is used to map pixels from the source rect that fall outside
-    the source image.
-    See: https://pillow.readthedocs.io/en/latest/PIL.html#PIL.ImageTransform.ExtentTransform
-    """
-    def __init__(self, src_rect, output_size, interp=Image.BILINEAR, fill=0):
-        """
-        Args:
-            src_rect (x0, y0, x1, y1): src coordinates
-            output_size (h, w): dst image size
-            interp: PIL interpolation methods. Defaults to ``Image.BILINEAR``
-                (the older alias ``Image.LINEAR`` was removed in Pillow 10).
-            fill: Fill color used when src_rect extends outside image
-        """
-        super().__init__()
-        self._set_attributes(locals())
-    def apply_image(self, img, interp=None):
-        """
-        Args:
-            img (ndarray): HxW or HxWxC image.
-            interp: PIL interpolation method overriding ``self.interp`` for this call.
-        Returns:
-            ndarray: the extracted region resampled to ``self.output_size``.
-        """
-        h, w = self.output_size
-        single_channel = len(img.shape) > 2 and img.shape[2] == 1
-        if single_channel:
-            pil_image = Image.fromarray(img[:, :, 0], mode="L")
-        else:
-            pil_image = Image.fromarray(img)
-        # Compare against None explicitly: Image.NEAREST has the value 0, so a
-        # truthiness test would silently discard an explicit NEAREST request.
-        resample = self.interp if interp is None else interp
-        pil_image = pil_image.transform(
-            size=(w, h),
-            method=Image.EXTENT,
-            data=self.src_rect,
-            resample=resample,
-            fill=self.fill,
-        )
-        ret = np.asarray(pil_image)
-        if single_channel:
-            # Restore the trailing channel axis dropped for PIL.
-            ret = np.expand_dims(ret, -1)
-        return ret
-    def apply_coords(self, coords):
-        """
-        Args:
-            coords (ndarray): Nx2 array of (x, y) points in source image coordinates.
-        Returns:
-            ndarray: float32 copy of the points in output image coordinates.
-        """
-        # Transform image center from source coordinates into output coordinates
-        # and then map the new origin to the corner of the output image.
-        h, w = self.output_size
-        x0, y0, x1, y1 = self.src_rect
-        new_coords = coords.astype(np.float32)
-        new_coords[:, 0] -= 0.5 * (x0 + x1)
-        new_coords[:, 1] -= 0.5 * (y0 + y1)
-        new_coords[:, 0] *= w / (x1 - x0)
-        new_coords[:, 1] *= h / (y1 - y0)
-        new_coords[:, 0] += 0.5 * w
-        new_coords[:, 1] += 0.5 * h
-        return new_coords
-    def apply_segmentation(self, segmentation):
-        # Label maps must not be blended, so force nearest-neighbour resampling.
-        segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
-        return segmentation
-class ResizeTransform(Transform):
-    """
-    Resize the image to a target size.
-    """
-    def __init__(self, h, w, new_h, new_w, interp=None):
-        """
-        Args:
-            h, w (int): original image size
-            new_h, new_w (int): new image size
-            interp: PIL interpolation methods, defaults to bilinear.
-        """
-        # TODO decide on PIL vs opencv
-        super().__init__()
-        if interp is None:
-            interp = Image.BILINEAR
-        self._set_attributes(locals())
-    def apply_image(self, img, interp=None):
-        """
-        Args:
-            img (ndarray): image with at most 4 dimensions, laid out as HxW(xC...).
-            interp: PIL interpolation method overriding ``self.interp`` for this call.
-        Returns:
-            ndarray: the image resized to (new_h, new_w).
-        """
-        # NOTE: the input shape is not validated against (self.h, self.w); the
-        # output depends only on (self.new_h, self.new_w). An earlier
-        # try/bare-except around a discarded comparison checked nothing and
-        # could only mask unrelated errors, so it has been dropped.
-        assert len(img.shape) <= 4
-        interp_method = interp if interp is not None else self.interp
-        if img.dtype == np.uint8:
-            single_channel = len(img.shape) > 2 and img.shape[2] == 1
-            if single_channel:
-                pil_image = Image.fromarray(img[:, :, 0], mode="L")
-            else:
-                pil_image = Image.fromarray(img)
-            pil_image = pil_image.resize((self.new_w, self.new_h), interp_method)
-            ret = np.asarray(pil_image)
-            if single_channel:
-                ret = np.expand_dims(ret, -1)
-        else:
-            # PIL only supports uint8
-            if any(x < 0 for x in img.strides):
-                # torch.from_numpy cannot handle negative strides.
-                img = np.ascontiguousarray(img)
-            img = torch.from_numpy(img)
-            shape = list(img.shape)
-            shape_4d = shape[:2] + [1] * (4 - len(shape)) + shape[2:]
-            img = img.view(shape_4d).permute(2, 3, 0, 1)  # hw(c) -> nchw
-            _PIL_RESIZE_TO_INTERPOLATE_MODE = {
-                Image.NEAREST: "nearest",
-                Image.BILINEAR: "bilinear",
-                Image.BICUBIC: "bicubic",
-            }
-            mode = _PIL_RESIZE_TO_INTERPOLATE_MODE[interp_method]
-            align_corners = None if mode == "nearest" else False
-            img = F.interpolate(
-                img, (self.new_h, self.new_w), mode=mode, align_corners=align_corners
-            )
-            shape[:2] = (self.new_h, self.new_w)
-            ret = img.permute(2, 3, 0, 1).view(shape).numpy()  # nchw -> hw(c)
-        return ret
-    def apply_coords(self, coords):
-        """
-        Scale Nx2 (x, y) points by the resize factors. ``coords`` is modified in place.
-        """
-        coords[:, 0] = coords[:, 0] * (self.new_w * 1.0 / self.w)
-        coords[:, 1] = coords[:, 1] * (self.new_h * 1.0 / self.h)
-        return coords
-    def apply_segmentation(self, segmentation):
-        # Label maps must not be blended, so force nearest-neighbour resampling.
-        segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
-        return segmentation
-    def inverse(self):
-        """Return the transform that resizes from (new_h, new_w) back to (h, w)."""
-        return ResizeTransform(self.new_h, self.new_w, self.h, self.w, self.interp)
-class RotationTransform(Transform):
-    """
-    This method returns a copy of this image, rotated the given
-    number of degrees counter clockwise around its center.
-    """
-    # NOTE(review): this class uses cv2 unconditionally, while the module-level
-    # import of cv2 is wrapped in try/except ImportError. Without OpenCV
-    # installed, constructing this transform will fail on the cv2 name.
-    def __init__(self, h, w, angle, expand=True, center=None, interp=None):
-        """
-        Args:
-            h, w (int): original image size
-            angle (float): degrees for rotation
-            expand (bool): choose if the image should be resized to fit the whole
-                rotated image (default), or simply cropped
-            center (tuple (width, height)): coordinates of the rotation center
-                if left to None, the center will be fit to the center of each image
-                center has no effect if expand=True because it only affects shifting
-            interp: cv2 interpolation method, default cv2.INTER_LINEAR
-        """
-        super().__init__()
-        image_center = np.array((w / 2, h / 2))
-        if center is None:
-            center = image_center
-        if interp is None:
-            interp = cv2.INTER_LINEAR
-        abs_cos, abs_sin = (abs(np.cos(np.deg2rad(angle))), abs(np.sin(np.deg2rad(angle))))
-        if expand:
-            # find the new width and height bounds
-            bound_w, bound_h = np.rint(
-                [h * abs_sin + w * abs_cos, h * abs_cos + w * abs_sin]
-            ).astype(int)
-        else:
-            bound_w, bound_h = w, h
-        # Presumably stores the locals above (image_center, center, interp,
-        # bound_w, bound_h, ...) as attributes, since the methods below read
-        # self.image_center, self.bound_w, etc. -- so the local variable names
-        # in this constructor are load-bearing. TODO confirm against fvcore.
-        self._set_attributes(locals())
-        # Matrix used for point coordinates.
-        self.rm_coords = self.create_rotation_matrix()
-        # Matrix used for images, built with a -0.5 offset.
-        # Needed because of this problem https://github.com/opencv/opencv/issues/11784
-        self.rm_image = self.create_rotation_matrix(offset=-0.5)
-    def apply_image(self, img, interp=None):
-        """
-        img should be a numpy array, formatted as Height * Width * Nchannels
-        """
-        # Empty input or a rotation by a multiple of 360 degrees: return the input object unchanged.
-        if len(img) == 0 or self.angle % 360 == 0:
-            return img
-        assert img.shape[:2] == (self.h, self.w)
-        interp = interp if interp is not None else self.interp
-        return cv2.warpAffine(img, self.rm_image, (self.bound_w, self.bound_h), flags=interp)
-    def apply_coords(self, coords):
-        """
-        coords should be a N * 2 array-like, containing N couples of (x, y) points
-        """
-        coords = np.asarray(coords, dtype=float)
-        # Empty input or a rotation by a multiple of 360 degrees: return the (float-converted) points as-is.
-        if len(coords) == 0 or self.angle % 360 == 0:
-            return coords
-        # cv2.transform is fed an (N, 1, 2) array here; the middle axis is dropped again afterwards.
-        return cv2.transform(coords[:, np.newaxis, :], self.rm_coords)[:, 0, :]
-    def apply_segmentation(self, segmentation):
-        # Use nearest-neighbour interpolation for segmentation maps.
-        segmentation = self.apply_image(segmentation, interp=cv2.INTER_NEAREST)
-        return segmentation
-    def create_rotation_matrix(self, offset=0):
-        """
-        Build the 2x3 affine matrix for this rotation.
-        Args:
-            offset (float): value added to both coordinates of the rotation center
-                (and of the image/bound centers when expand is set).
-        Returns:
-            the matrix returned by cv2.getRotationMatrix2D, with its translation
-            column adjusted when ``self.expand`` is true.
-        """
-        center = (self.center[0] + offset, self.center[1] + offset)
-        rm = cv2.getRotationMatrix2D(tuple(center), self.angle, 1)
-        if self.expand:
-            # Find the coordinates of the center of rotation in the new image
-            # The only point for which we know the future coordinates is the center of the image
-            rot_im_center = cv2.transform(self.image_center[None, None, :] + offset, rm)[0, 0, :]
-            new_center = np.array([self.bound_w / 2, self.bound_h / 2]) + offset - rot_im_center
-            # shift the rotation center to the new coordinates
-            rm[:, 2] += new_center
-        return rm
-    def inverse(self):
-        """
-        The inverse is to rotate it back with expand, and crop to get the original shape.
-        """
-        if not self.expand:  # Not possible to inverse if a part of the image is lost
-            raise NotImplementedError()
-        # Rotate the expanded (bound_h x bound_w) image by the opposite angle, again with expand=True.
-        rotation = RotationTransform(
-            self.bound_h, self.bound_w, -self.angle, True, None, self.interp
-        )
-        # Crop a (self.w x self.h) window centered in the re-rotated image.
-        crop = CropTransform(
-            (rotation.bound_w - self.w) // 2, (rotation.bound_h - self.h) // 2, self.w, self.h
-        )
-        return TransformList([rotation, crop])
-class ColorTransform(Transform):
-    """
-    Generic wrapper for any photometric transforms.
-    Such a transform only touches the color space of the image; the
-    coordinate space (e.g. annotation coordinates such as bounding boxes)
-    and segmentation maps are passed through unchanged.
-    """
-    def __init__(self, op):
-        """
-        Args:
-            op (Callable): operation to be applied to the image,
-                which takes in an ndarray and returns an ndarray.
-        Raises:
-            ValueError: if ``op`` is not callable.
-        """
-        is_callable = callable(op)
-        if is_callable is False:
-            raise ValueError("op parameter should be callable")
-        super().__init__()
-        self._set_attributes({"self": self, "op": op})
-    def apply_image(self, img):
-        """Apply the wrapped operation to the image."""
-        result = self.op(img)
-        return result
-    def apply_segmentation(self, segmentation):
-        """Segmentation maps are not affected by a color transform."""
-        return segmentation
-    def apply_coords(self, coords):
-        """Coordinates are not affected by a color transform."""
-        return coords
-    def inverse(self):
-        """Return a no-op transform."""
-        return NoOpTransform()
-class PILColorTransform(ColorTransform):
-    """
-    Generic wrapper for PIL Photometric image transforms,
-        which affect the color space and not the coordinate
-        space of the image
-    """
-    def __init__(self, op):
-        """
-        Args:
-            op (Callable): operation to be applied to the image,
-                which takes in a PIL Image and returns a transformed
-                PIL Image.
-                For reference on possible operations see:
-                - https://pillow.readthedocs.io/en/stable/
-        Raises:
-            ValueError: if ``op`` is not callable (raised by ``ColorTransform.__init__``
-                with the message "op parameter should be callable").
-        """
-        # The callable check lives in ColorTransform.__init__, which raises the
-        # same ValueError before any state is set.
-        super().__init__(op)
-    def apply_image(self, img):
-        """Convert the ndarray to a PIL Image, apply the operation, and convert back."""
-        pil_input = Image.fromarray(img)
-        pil_output = super().apply_image(pil_input)
-        return np.asarray(pil_output)
-def HFlip_rotated_box(transform, rotated_boxes):
-    """
-    Horizontally flip rotated boxes, modifying ``rotated_boxes`` in place.
-    Args:
-        transform: the flip transform; its ``width`` attribute is used.
-        rotated_boxes (ndarray): Nx5 floating point array of
-            (x_center, y_center, width, height, angle_degrees) format
-            in absolute coordinates.
-    Returns:
-        the same ``rotated_boxes`` object, after the update.
-    """
-    x_center = rotated_boxes[:, 0]
-    angle = rotated_boxes[:, 4]
-    # Mirror the center about the vertical axis of the image.
-    rotated_boxes[:, 0] = transform.width - x_center
-    # Flipping reverses the rotation direction.
-    rotated_boxes[:, 4] = -angle
-    return rotated_boxes
-def Resize_rotated_box(transform, rotated_boxes):
-    """
-    Resize rotated boxes, modifying ``rotated_boxes`` in place. For details of how
-    these (approximation) formulas are derived, please refer to :meth:`RotatedBoxes.scale`.
-    Args:
-        transform: the resize transform; its ``w``, ``h``, ``new_w`` and ``new_h``
-            attributes are used.
-        rotated_boxes (ndarray): Nx5 floating point array of
-            (x_center, y_center, width, height, angle_degrees) format
-            in absolute coordinates.
-    Returns:
-        the same ``rotated_boxes`` object, after the update.
-    """
-    sx = transform.new_w * 1.0 / transform.w
-    sy = transform.new_h * 1.0 / transform.h
-    # Centers scale independently along each axis.
-    rotated_boxes[:, 0] *= sx
-    rotated_boxes[:, 1] *= sy
-    # Angles are stored in degrees; convert to radians for the trig functions.
-    theta = rotated_boxes[:, 4] * np.pi / 180.0
-    cos_t = np.cos(theta)
-    sin_t = np.sin(theta)
-    x_cos, y_sin = sx * cos_t, sy * sin_t
-    x_sin, y_cos = sx * sin_t, sy * cos_t
-    width_factor = np.sqrt(np.square(x_cos) + np.square(y_sin))
-    height_factor = np.sqrt(np.square(x_sin) + np.square(y_cos))
-    rotated_boxes[:, 2] *= width_factor
-    rotated_boxes[:, 3] *= height_factor
-    rotated_boxes[:, 4] = np.arctan2(x_sin, y_cos) * 180 / np.pi
-    return rotated_boxes
-# Register the "rotated_box" handlers defined above on the flip and resize transforms.
-HFlipTransform.register_type("rotated_box", HFlip_rotated_box)
-ResizeTransform.register_type("rotated_box", Resize_rotated_box)
-# The no-op transform gets an identity handler that returns the boxes unchanged.
-# not necessary any more with latest fvcore
-NoOpTransform.register_type("rotated_box", lambda t, x: x)

cutler/demo/__init__.py DELETED Viewed

@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-from demo import *
-from predictor import *
-# Export every name currently in the module globals that does not start with
-# an underscore, which includes whatever the star-imports above brought in.
-__all__ = [k for k in globals().keys() if not k.startswith("_")]

cutler/demo/demo.py DELETED Viewed

@@ -1,197 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/demo/demo.py
-import argparse
-import glob
-import multiprocessing as mp
-import numpy as np
-import os
-import tempfile
-import time
-import warnings
-import cv2
-import tqdm
-from detectron2.config import get_cfg
-from detectron2.data.detection_utils import read_image
-from detectron2.utils.logger import setup_logger
-import sys
-sys.path.append('./')
-sys.path.append('../')
-from config import add_cutler_config
-from predictor import VisualizationDemo
-# constants
-# Title of the OpenCV window used for image and webcam output in the script body below.
-WINDOW_NAME = "CutLER detections"
-def setup_cfg(args):
-    """
-    Build a frozen config from the config file and command-line overrides.
-    Args:
-        args: parsed arguments; ``config_file``, ``opts`` and
-            ``confidence_threshold`` are read.
-    Returns:
-        the frozen config object.
-    """
-    cfg = get_cfg()
-    add_cutler_config(cfg)
-    # The file is merged first, then the command-line 'KEY VALUE' overrides.
-    cfg.merge_from_file(args.config_file)
-    cfg.merge_from_list(args.opts)
-    # SyncBN is not supported on CPU and can cause errors, so fall back to
-    # plain BN for both the ResNet and the FPN when running on a CPU.
-    on_cpu = cfg.MODEL.DEVICE == 'cpu'
-    if on_cpu and cfg.MODEL.RESNETS.NORM == 'SyncBN':
-        cfg.MODEL.RESNETS.NORM = "BN"
-        cfg.MODEL.FPN.NORM = "BN"
-    # Use the same score threshold for every builtin model family.
-    threshold = args.confidence_threshold
-    cfg.MODEL.RETINANET.SCORE_THRESH_TEST = threshold
-    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = threshold
-    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = threshold
-    cfg.freeze()
-    return cfg
-def get_parser():
-    """
-    Create the command-line argument parser for the demo script.
-    Returns:
-        argparse.ArgumentParser: parser with the config, input source, output,
-        confidence threshold and config override options.
-    """
-    parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs")
-    add = parser.add_argument
-    add(
-        "--config-file",
-        default="model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN.yaml",
-        metavar="FILE",
-        help="path to config file",
-    )
-    # Input sources: webcam, a video file, or a list/glob of images.
-    add("--webcam", action="store_true", help="Take inputs from webcam.")
-    add("--video-input", help="Path to video file.")
-    input_help = (
-        "A list of space separated input images; "
-        "or a single glob pattern such as 'directory/*.jpg'"
-    )
-    add("--input", nargs="+", help=input_help)
-    output_help = (
-        "A file or directory to save output visualizations. "
-        "If not given, will show output in an OpenCV window."
-    )
-    add("--output", help=output_help)
-    add(
-        "--confidence-threshold",
-        type=float,
-        default=0.35,
-        help="Minimum score for instance predictions to be shown",
-    )
-    # Everything after --opts is collected verbatim.
-    add(
-        "--opts",
-        help="Modify config options using the command-line 'KEY VALUE' pairs",
-        default=[],
-        nargs=argparse.REMAINDER,
-    )
-    return parser
-def test_opencv_video_format(codec, file_ext):
-    """
-    Check whether this OpenCV build can write a video with the given codec and extension.
-    Writes 30 blank 10x10 color frames into a temporary directory and reports
-    whether a file was produced.
-    Args:
-        codec (str): codec code whose characters are passed to ``cv2.VideoWriter_fourcc``,
-            e.g. "x264" or "mp4v".
-        file_ext (str): file extension appended to the test file name, e.g. ".mkv".
-    Returns:
-        bool: True if the output file exists after the writer is released.
-    """
-    with tempfile.TemporaryDirectory(prefix="video_format_test") as tmp_dir:
-        filename = os.path.join(tmp_dir, "test_file" + file_ext)
-        writer = cv2.VideoWriter(
-            filename=filename,
-            fourcc=cv2.VideoWriter_fourcc(*codec),
-            fps=float(30),
-            frameSize=(10, 10),
-            isColor=True,
-        )
-        blank_frame = np.zeros((10, 10, 3), np.uint8)
-        for _ in range(30):
-            writer.write(blank_frame)
-        writer.release()
-        # Must be checked before the temporary directory is cleaned up.
-        return os.path.isfile(filename)
-# Script entry point: run the demo on images (--input), a webcam (--webcam)
-# or a video file (--video-input), in that order of precedence.
-if __name__ == "__main__":
-    mp.set_start_method("spawn", force=True)
-    args = get_parser().parse_args()
-    setup_logger(name="fvcore")
-    logger = setup_logger()
-    logger.info("Arguments: " + str(args))
-    cfg = setup_cfg(args)
-    demo = VisualizationDemo(cfg)
-    if args.input:
-        if len(args.input) == 1:
-            # A single argument is treated as a glob pattern.
-            args.input = glob.glob(os.path.expanduser(args.input[0]))
-            assert args.input, "The input path(s) was not found"
-        for path in tqdm.tqdm(args.input, disable=not args.output):
-            # use PIL, to be consistent with evaluation
-            img = read_image(path, format="BGR")
-            start_time = time.time()
-            predictions, visualized_output = demo.run_on_image(img)
-            logger.info(
-                "{}: {} in {:.2f}s".format(
-                    path,
-                    "detected {} instances".format(len(predictions["instances"]))
-                    if "instances" in predictions
-                    else "finished",
-                    time.time() - start_time,
-                )
-            )
-            if args.output:
-                if os.path.isdir(args.output):
-                    assert os.path.isdir(args.output), args.output
-                    out_filename = os.path.join(args.output, os.path.basename(path))
-                else:
-                    # A single output file can only hold a single input image.
-                    assert len(args.input) == 1, "Please specify a directory with args.output"
-                    out_filename = args.output
-                visualized_output.save(out_filename)
-            else:
-                cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
-                # The channel order is reversed before handing the image to OpenCV.
-                cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
-                if cv2.waitKey(0) == 27:
-                    break  # esc to quit
-    elif args.webcam:
-        assert args.input is None, "Cannot have both --input and --webcam!"
-        assert args.output is None, "output not yet supported with --webcam!"
-        cam = cv2.VideoCapture(0)
-        for vis in tqdm.tqdm(demo.run_on_video(cam)):
-            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
-            cv2.imshow(WINDOW_NAME, vis)
-            if cv2.waitKey(1) == 27:
-                break  # esc to quit
-        cam.release()
-        cv2.destroyAllWindows()
-    elif args.video_input:
-        video = cv2.VideoCapture(args.video_input)
-        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        frames_per_second = video.get(cv2.CAP_PROP_FPS)
-        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
-        basename = os.path.basename(args.video_input)
-        # Prefer x264/.mkv when this OpenCV build can write it, otherwise fall back to mp4v/.mp4.
-        codec, file_ext = (
-            ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4")
-        )
-        # The codec name carries no leading dot; comparing against ".mp4v"
-        # meant this warning could never be emitted.
-        if codec == "mp4v":
-            warnings.warn("x264 codec not available, switching to mp4v")
-        if args.output:
-            if os.path.isdir(args.output):
-                output_fname = os.path.join(args.output, basename)
-                output_fname = os.path.splitext(output_fname)[0] + file_ext
-            else:
-                output_fname = args.output
-            # Refuse to overwrite an existing file.
-            assert not os.path.isfile(output_fname), output_fname
-            output_file = cv2.VideoWriter(
-                filename=output_fname,
-                # some installation of opencv may not support x264 (due to its license),
-                # you can try other format (e.g. MPEG)
-                fourcc=cv2.VideoWriter_fourcc(*codec),
-                fps=float(frames_per_second),
-                frameSize=(width, height),
-                isColor=True,
-            )
-        assert os.path.isfile(args.video_input)
-        for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
-            if args.output:
-                output_file.write(vis_frame)
-            else:
-                cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
-                cv2.imshow(basename, vis_frame)
-                if cv2.waitKey(1) == 27:
-                    break  # esc to quit
-        video.release()
-        if args.output:
-            output_file.release()
-        else:
-            cv2.destroyAllWindows()

cutler/demo/predictor.py DELETED Viewed

@@ -1,219 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-import atexit
-import bisect
-import multiprocessing as mp
-from collections import deque
-import cv2
-import torch
-from detectron2.data import MetadataCatalog
-import sys
-sys.path.append('./')
-from engine.defaults import DefaultPredictor
-from detectron2.utils.video_visualizer import VideoVisualizer
-from detectron2.utils.visualizer import ColorMode, Visualizer
-class VisualizationDemo(object):
-    """
-    Run a predictor on single images or on video frames and draw the
-    predictions on top of the input.
-    """
-    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
-        """
-        Args:
-            cfg (CfgNode):
-            instance_mode (ColorMode):
-            parallel (bool): whether to run the model in different processes from visualization.
-                Useful since the visualization logic can be slow.
-        """
-        # Use the first test dataset's metadata; fall back to the "__unused"
-        # catalog entry when no test dataset is configured.
-        self.metadata = MetadataCatalog.get(
-            cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
-        )
-        self.cpu_device = torch.device("cpu")
-        self.instance_mode = instance_mode
-        self.parallel = parallel
-        if parallel:
-            # One worker per visible GPU (AsyncPredictor starts one CPU worker when there are none).
-            num_gpu = torch.cuda.device_count()
-            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
-        else:
-            self.predictor = DefaultPredictor(cfg)
-    def run_on_image(self, image):
-        """
-        Args:
-            image (np.ndarray): an image of shape (H, W, C) (in BGR order).
-                This is the format used by OpenCV.
-        Returns:
-            predictions (dict): the output of the model.
-            vis_output (VisImage): the visualized image output. It stays None when
-                the predictions contain none of "panoptic_seg", "sem_seg" or "instances".
-        """
-        vis_output = None
-        predictions = self.predictor(image)
-        # Convert image from OpenCV BGR format to Matplotlib RGB format.
-        image = image[:, :, ::-1]
-        visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
-        if "panoptic_seg" in predictions:
-            panoptic_seg, segments_info = predictions["panoptic_seg"]
-            vis_output = visualizer.draw_panoptic_seg_predictions(
-                panoptic_seg.to(self.cpu_device), segments_info
-            )
-        else:
-            # Both branches below may run on the same visualizer; when both keys are
-            # present the value returned by the instance drawing is the one kept.
-            if "sem_seg" in predictions:
-                vis_output = visualizer.draw_sem_seg(
-                    predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
-                )
-            if "instances" in predictions:
-                instances = predictions["instances"].to(self.cpu_device)
-                vis_output = visualizer.draw_instance_predictions(predictions=instances)
-        return predictions, vis_output
-    def _frame_from_video(self, video):
-        """
-        Yield frames read from ``video`` until a read fails or the capture is
-        no longer opened.
-        """
-        while video.isOpened():
-            success, frame = video.read()
-            if success:
-                yield frame
-            else:
-                break
-    def run_on_video(self, video):
-        """
-        Visualizes predictions on frames of the input video.
-        Args:
-            video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
-                either a webcam or a video file.
-        Yields:
-            ndarray: BGR visualizations of each video frame.
-        """
-        video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
-        def process_predictions(frame, predictions):
-            # Draw the predictions for one frame and return the result as a BGR array.
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            # Exactly one kind of prediction is drawn, checked in this priority order.
-            if "panoptic_seg" in predictions:
-                panoptic_seg, segments_info = predictions["panoptic_seg"]
-                vis_frame = video_visualizer.draw_panoptic_seg_predictions(
-                    frame, panoptic_seg.to(self.cpu_device), segments_info
-                )
-            elif "instances" in predictions:
-                predictions = predictions["instances"].to(self.cpu_device)
-                vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
-            elif "sem_seg" in predictions:
-                vis_frame = video_visualizer.draw_sem_seg(
-                    frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
-                )
-            # NOTE(review): when none of the three keys is present, vis_frame is never
-            # assigned and the next line raises UnboundLocalError.
-            # Converts Matplotlib RGB format to OpenCV BGR format
-            vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
-            return vis_frame
-        frame_gen = self._frame_from_video(video)
-        if self.parallel:
-            buffer_size = self.predictor.default_buffer_size
-            # Frames submitted to the predictor whose results have not been fetched yet.
-            frame_data = deque()
-            for cnt, frame in enumerate(frame_gen):
-                frame_data.append(frame)
-                self.predictor.put(frame)
-                # Only start fetching results once more than buffer_size frames are queued.
-                if cnt >= buffer_size:
-                    frame = frame_data.popleft()
-                    predictions = self.predictor.get()
-                    yield process_predictions(frame, predictions)
-            # Input exhausted: drain the frames that are still pending.
-            while len(frame_data):
-                frame = frame_data.popleft()
-                predictions = self.predictor.get()
-                yield process_predictions(frame, predictions)
-        else:
-            for frame in frame_gen:
-                yield process_predictions(frame, self.predictor(frame))
-class AsyncPredictor:
-    """
-    A predictor that runs the model asynchronously, possibly on >1 GPUs.
-    Because rendering the visualization takes a considerable amount of time,
-    this helps improve throughput a little bit when rendering videos.
-    """
-    class _StopToken:
-        # Sentinel put on the task queue to make a worker leave its loop.
-        pass
-    class _PredictWorker(mp.Process):
-        # Worker process: builds its own DefaultPredictor and serves the task queue.
-        def __init__(self, cfg, task_queue, result_queue):
-            self.cfg = cfg
-            self.task_queue = task_queue
-            self.result_queue = result_queue
-            super().__init__()
-        def run(self):
-            # The predictor is created here, i.e. when the process runs, not in __init__.
-            predictor = DefaultPredictor(self.cfg)
-            while True:
-                task = self.task_queue.get()
-                if isinstance(task, AsyncPredictor._StopToken):
-                    break
-                idx, data = task
-                result = predictor(data)
-                # Send the index back so the consumer can restore submission order.
-                self.result_queue.put((idx, result))
-    def __init__(self, cfg, num_gpus: int = 1):
-        """
-        Args:
-            cfg (CfgNode):
-            num_gpus (int): if 0, will run on CPU
-        """
-        # At least one worker is always created, even without GPUs.
-        num_workers = max(num_gpus, 1)
-        self.task_queue = mp.Queue(maxsize=num_workers * 3)
-        self.result_queue = mp.Queue(maxsize=num_workers * 3)
-        self.procs = []
-        for gpuid in range(max(num_gpus, 1)):
-            # Each worker gets its own config clone pinned to one device.
-            cfg = cfg.clone()
-            cfg.defrost()
-            cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
-            self.procs.append(
-                AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
-            )
-        # Number of tasks submitted / results handed out so far.
-        self.put_idx = 0
-        self.get_idx = 0
-        # Results that arrived early: parallel lists kept sorted by task index.
-        self.result_rank = []
-        self.result_data = []
-        for p in self.procs:
-            p.start()
-        # Ask the workers to stop when the interpreter exits.
-        atexit.register(self.shutdown)
-    def put(self, image):
-        """Submit ``image`` for prediction; task indices start at 1."""
-        self.put_idx += 1
-        self.task_queue.put((self.put_idx, image))
-    def get(self):
-        """Return the next result in submission order, blocking until it is available."""
-        self.get_idx += 1  # the index needed for this request
-        # The wanted result may already sit at the head of the early-arrival buffer.
-        if len(self.result_rank) and self.result_rank[0] == self.get_idx:
-            res = self.result_data[0]
-            del self.result_data[0], self.result_rank[0]
-            return res
-        while True:
-            # make sure the results are returned in the correct order
-            idx, res = self.result_queue.get()
-            if idx == self.get_idx:
-                return res
-            # Not the one we are waiting for: stash it at its sorted position.
-            insert = bisect.bisect(self.result_rank, idx)
-            self.result_rank.insert(insert, idx)
-            self.result_data.insert(insert, res)
-    def __len__(self):
-        """Number of submitted tasks whose results have not been fetched yet."""
-        return self.put_idx - self.get_idx
-    def __call__(self, image):
-        """Submit ``image`` and return the next in-order result."""
-        self.put(image)
-        return self.get()
-    def shutdown(self):
-        """Put one stop token per worker on the task queue."""
-        for _ in self.procs:
-            self.task_queue.put(AsyncPredictor._StopToken())
-    @property
-    def default_buffer_size(self):
-        """Five frames of buffering per worker process."""
-        return len(self.procs) * 5

cutler/engine/__init__.py DELETED Viewed

@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-from .train_loop import *
-__all__ = [k for k in globals().keys() if not k.startswith("_")]
-from .defaults import *

cutler/engine/defaults.py DELETED Viewed

@@ -1,726 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/defaults.py
-"""
-This file contains components with some default boilerplate logic user may need
-in training / testing. They will not work for everyone, but many users may find them useful.
-The behavior of functions/classes in this file is subject to change,
-since they are meant to represent the "common default behavior" people need in their projects.
-"""
-import argparse
-import logging
-import os
-import sys
-import weakref
-from collections import OrderedDict
-from typing import Optional
-import torch
-from fvcore.nn.precise_bn import get_bn_modules
-from omegaconf import OmegaConf
-from torch.nn.parallel import DistributedDataParallel
-import data.transforms as T
-from detectron2.checkpoint import DetectionCheckpointer
-from detectron2.config import CfgNode, LazyConfig
-from detectron2.data import (
-    MetadataCatalog,
-)
-from data import (
-    build_detection_test_loader,
-    build_detection_train_loader,
-)
-from detectron2.evaluation import (
-    DatasetEvaluator,
-    inference_on_dataset,
-    print_csv_format,
-    verify_results,
-)
-from modeling import build_model
-from solver import build_lr_scheduler, build_optimizer
-from detectron2.utils import comm
-from detectron2.utils.collect_env import collect_env_info
-from detectron2.utils.env import seed_all_rng
-from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
-from detectron2.utils.file_io import PathManager
-from detectron2.utils.logger import setup_logger
-from detectron2.engine import hooks
-from detectron2.engine import TrainerBase
-from .train_loop import CustomAMPTrainer, CustomSimpleTrainer
-__all__ = [
-    "create_ddp_model",
-    "default_argument_parser",
-    "default_setup",
-    "default_writers",
-    "DefaultPredictor",
-    "DefaultTrainer",
-]
-def create_ddp_model(model, *, fp16_compression=False, **kwargs):
-    """
-    Wrap ``model`` in DistributedDataParallel when more than one process is running.
-    Args:
-        model: a torch.nn.Module
-        fp16_compression: when true, register the fp16 compression comm hook on the wrapper.
-            See more at https://pytorch.org/docs/stable/ddp_comm_hooks.html#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_hook
-        kwargs: forwarded to :module:`torch.nn.parallel.DistributedDataParallel`;
-            ``device_ids`` defaults to the local rank of the current process.
-    Returns:
-        ``model`` itself for a world size of 1, otherwise the DDP wrapper.
-    """  # noqa
-    world_size = comm.get_world_size()
-    if world_size == 1:
-        # Single process: nothing to synchronize.
-        return model
-    # Only query the local rank when the caller did not choose devices.
-    if "device_ids" not in kwargs:
-        kwargs["device_ids"] = [comm.get_local_rank()]
-    wrapped = DistributedDataParallel(model, **kwargs)
-    if not fp16_compression:
-        return wrapped
-    from torch.distributed.algorithms.ddp_comm_hooks import default as comm_hooks
-    wrapped.register_comm_hook(state=None, hook=comm_hooks.fp16_compress_hook)
-    return wrapped
-def default_argument_parser(epilog=None):
-    """
-    Build the command line parser shared by the training and evaluation scripts.
-    Args:
-        epilog (str): epilog passed to ArgumentParser describing the usage.
-            A set of usage examples is used when it is empty or None.
-    Returns:
-        argparse.ArgumentParser:
-    """
-    usage_examples = f"""
-Examples:
-Run on single machine:
-    $ {sys.argv[0]} --num-gpus 8 --config-file cfg.yaml
-Change some config options:
-    $ {sys.argv[0]} --config-file cfg.yaml MODEL.WEIGHTS /path/to/weight.pth SOLVER.BASE_LR 0.001
-Run on multiple machines:
-    (machine0)$ {sys.argv[0]} --machine-rank 0 --num-machines 2 --dist-url <URL> [--other-flags]
-    (machine1)$ {sys.argv[0]} --machine-rank 1 --num-machines 2 --dist-url <URL> [--other-flags]
-"""
-    opts_help = """
-Modify config options at the end of the command. For Yacs configs, use
-space-separated "PATH.KEY VALUE" pairs.
-For python-based LazyConfig, use "path.key=value".
-        """.strip()
-    # PyTorch still may leave orphan processes in multi-gpu training.
-    # Therefore the port is derived deterministically from the user id,
-    # so that users notice orphan processes by seeing the port occupied.
-    uid = os.getuid() if sys.platform != "win32" else 1
-    port = 2**15 + 2**14 + hash(uid) % 2**14
-    parser = argparse.ArgumentParser(
-        epilog=epilog or usage_examples,
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-    )
-    add = parser.add_argument
-    add("--config-file", default="", metavar="FILE", help="path to config file")
-    add(
-        "--resume",
-        action="store_true",
-        help="Whether to attempt to resume from the checkpoint directory. "
-        "See documentation of `DefaultTrainer.resume_or_load()` for what it means.",
-    )
-    add("--eval-only", action="store_true", help="perform evaluation only")
-    add("--num-gpus", type=int, default=1, help="number of gpus *per machine*")
-    add("--num-machines", type=int, default=1, help="total number of machines")
-    add("--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)")
-    add("--test-dataset", type=str, default="", help="the dataset used for evaluation")
-    add("--train-dataset", type=str, default="", help="the dataset used for training")
-    add("--no-segm", action="store_true", help="perform evaluation on detection only")
-    add(
-        "--dist-url",
-        default=f"tcp://127.0.0.1:{port}",
-        help="initialization URL for pytorch distributed backend. See "
-        "https://pytorch.org/docs/stable/distributed.html for details.",
-    )
-    # Everything left on the command line is collected as config overrides.
-    add("opts", help=opts_help, default=None, nargs=argparse.REMAINDER)
-    return parser
-def _try_get_key(cfg, *keys, default=None):
-    """
-    Return the value of the first of ``keys`` that can be selected from ``cfg``;
-    return ``default`` when none of them exists.
-    """
-    # A CfgNode is converted through its dump so OmegaConf.select can be used on it.
-    conf = OmegaConf.create(cfg.dump()) if isinstance(cfg, CfgNode) else cfg
-    # Unique sentinel so that a stored None is still reported as found.
-    missing = object()
-    for key in keys:
-        found = OmegaConf.select(conf, key, default=missing)
-        if found is not missing:
-            return found
-    return default
-def _highlight(code, filename):
-    """
-    Return ``code`` with terminal colors applied, or unchanged when pygments
-    cannot be imported. Files ending in ".py" are lexed as Python, everything
-    else as YAML.
-    """
-    try:
-        import pygments
-    except ImportError:
-        return code
-    from pygments.lexers import Python3Lexer, YamlLexer
-    from pygments.formatters import Terminal256Formatter
-    if filename.endswith(".py"):
-        lexer = Python3Lexer()
-    else:
-        lexer = YamlLexer()
-    formatter = Terminal256Formatter(style="monokai")
-    return pygments.highlight(code, lexer, formatter)
-def default_setup(cfg, args):
-    """
-    Perform some basic common setups at the beginning of a job, including:
-    1. Set up the detectron2 logger
-    2. Log basic information about environment, cmdline arguments, and config
-    3. Backup the config to the output directory
-    4. Seed the random number generators and set ``torch.backends.cudnn.benchmark``
-    Args:
-        cfg (CfgNode or omegaconf.DictConfig): the full config to be used
-        args (argparse.NameSpace): the command line arguments to be logged
-    """
-    output_dir = _try_get_key(cfg, "OUTPUT_DIR", "output_dir", "train.output_dir")
-    if comm.is_main_process() and output_dir:
-        PathManager.mkdirs(output_dir)
-    rank = comm.get_rank()
-    setup_logger(output_dir, distributed_rank=rank, name="fvcore")
-    logger = setup_logger(output_dir, distributed_rank=rank)
-    logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size()))
-    logger.info("Environment info:\n" + collect_env_info())
-    logger.info("Command line arguments: " + str(args))
-    if hasattr(args, "config_file") and args.config_file != "":
-        # Read through a context manager so the file handle is closed right away
-        # instead of being left for the garbage collector.
-        with PathManager.open(args.config_file, "r") as f:
-            config_text = f.read()
-        logger.info(
-            "Contents of args.config_file={}:\n{}".format(
-                args.config_file,
-                _highlight(config_text, args.config_file),
-            )
-        )
-    if comm.is_main_process() and output_dir:
-        # Note: some of our scripts may expect the existence of
-        # config.yaml in output directory
-        path = os.path.join(output_dir, "config.yaml")
-        if isinstance(cfg, CfgNode):
-            logger.info("Running with full config:\n{}".format(_highlight(cfg.dump(), ".yaml")))
-            with PathManager.open(path, "w") as f:
-                f.write(cfg.dump())
-        else:
-            LazyConfig.save(cfg, path)
-        logger.info("Full config saved to {}".format(path))
-    # make sure each worker has a different, yet deterministic seed if specified
-    # (a negative seed means "not specified" and is passed on as None)
-    seed = _try_get_key(cfg, "SEED", "train.seed", default=-1)
-    seed_all_rng(None if seed < 0 else seed + rank)
-    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
-    # typical validation set.
-    if not (hasattr(args, "eval_only") and args.eval_only):
-        torch.backends.cudnn.benchmark = _try_get_key(
-            cfg, "CUDNN_BENCHMARK", "train.cudnn_benchmark", default=False
-        )
-def default_writers(output_dir: str, max_iter: Optional[int] = None):
-    """
-    Create the default set of :class:`EventWriter` objects: a
-    :class:`CommonMetricPrinter`, a :class:`JSONWriter` and a
-    :class:`TensorboardXWriter`, in that order.
-    Args:
-        output_dir: directory to store JSON metrics and tensorboard events;
-            it is created if needed.
-        max_iter: the total number of iterations
-    Returns:
-        list[EventWriter]: a list of :class:`EventWriter` objects.
-    """
-    PathManager.mkdirs(output_dir)
-    metrics_file = os.path.join(output_dir, "metrics.json")
-    # The printer may not always show what you want to see, since it prints "common" metrics only.
-    writers = [CommonMetricPrinter(max_iter)]
-    writers.append(JSONWriter(metrics_file))
-    writers.append(TensorboardXWriter(output_dir))
-    return writers
-class DefaultPredictor:
-    """
-    Simple end-to-end predictor built from a config, running on a single
-    device for one input image at a time.
-    On top of calling the model directly, it:
-    1. Loads the checkpoint given by `cfg.MODEL.WEIGHTS`.
-    2. Always takes a BGR image and converts it as required by `cfg.INPUT.FORMAT`.
-    3. Resizes the image according to `cfg.INPUT.{MIN,MAX}_SIZE_TEST`.
-    4. Takes one image and returns one output instead of a batch.
-    It is meant for simple demos, not for benchmarks or complicated inference
-    logic; for those, use its source as an example of how to build and call
-    the model manually.
-    Attributes:
-        metadata (Metadata): the metadata of the first dataset in
-            cfg.DATASETS.TEST. Only set when that list is not empty.
-    Examples:
-    ::
-        pred = DefaultPredictor(cfg)
-        inputs = cv2.imread("input.jpg")
-        outputs = pred(inputs)
-    """
-    def __init__(self, cfg):
-        self.cfg = cfg.clone()  # cfg can be modified by model
-        self.model = build_model(self.cfg)
-        self.model.eval()
-        test_sets = cfg.DATASETS.TEST
-        if len(test_sets):
-            self.metadata = MetadataCatalog.get(test_sets[0])
-        DetectionCheckpointer(self.model).load(cfg.MODEL.WEIGHTS)
-        # Same value for both ends of the range: a fixed shortest-edge size.
-        min_size = cfg.INPUT.MIN_SIZE_TEST
-        self.aug = T.ResizeShortestEdge([min_size, min_size], cfg.INPUT.MAX_SIZE_TEST)
-        self.input_format = cfg.INPUT.FORMAT
-        assert self.input_format in ["RGB", "BGR"], self.input_format
-    def __call__(self, original_image):
-        """
-        Args:
-            original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).
-        Returns:
-            predictions (dict):
-                the output of the model for one image only.
-                See :doc:`/tutorials/models` for details about the format.
-        """
-        with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
-            if self.input_format == "RGB":
-                # The model expects RGB, so flip the channel axis of the BGR input.
-                original_image = original_image[:, :, ::-1]
-            # Original size is passed along with the resized image.
-            height, width = original_image.shape[:2]
-            resize = self.aug.get_transform(original_image)
-            resized = resize.apply_image(original_image)
-            # HWC -> CHW float tensor.
-            chw = resized.astype("float32").transpose(2, 0, 1)
-            batch = [{"image": torch.as_tensor(chw), "height": height, "width": width}]
-            return self.model(batch)[0]
-class DefaultTrainer(TrainerBase):
-    """
-    A trainer with default training logic. It does the following:
-    1. Create a :class:`SimpleTrainer` using model, optimizer, dataloader
-       defined by the given config. Create a LR scheduler defined by the config.
-    2. Load the last checkpoint or `cfg.MODEL.WEIGHTS`, if exists, when
-       `resume_or_load` is called.
-    3. Register a few common hooks defined by the config.
-    It is created to simplify the **standard model training workflow** and reduce code boilerplate
-    for users who only need the standard training workflow, with standard features.
-    It means this class makes *many assumptions* about your training logic that
-    may easily become invalid in a new research. In fact, any assumptions beyond those made in the
-    :class:`SimpleTrainer` are too much for research.
-    The code of this class has been annotated about restrictive assumptions it makes.
-    When they do not work for you, you're encouraged to:
-    1. Overwrite methods of this class, OR:
-    2. Use :class:`SimpleTrainer`, which only does minimal SGD training and
-       nothing else. You can then add your own hooks if needed. OR:
-    3. Write your own training loop similar to `tools/plain_train_net.py`.
-    See the :doc:`/tutorials/training` tutorials for more details.
-    Note that the behavior of this class, like other functions/classes in
-    this file, is not stable, since it is meant to represent the "common default behavior".
-    It is only guaranteed to work well with the standard models and training workflow in detectron2.
-    To obtain more stable behavior, write your own training logic with other public APIs.
-    Examples:
-    ::
-        trainer = DefaultTrainer(cfg)
-        trainer.resume_or_load()  # load last checkpoint or MODEL.WEIGHTS
-        trainer.train()
-    Attributes:
-        scheduler:
-        checkpointer (DetectionCheckpointer):
-        cfg (CfgNode):
-    """
-    def __init__(self, cfg):
-        """
-        Build the model, optimizer, data loader, inner trainer, LR scheduler
-        and checkpointer from ``cfg`` and register the default hooks.
-        Args:
-            cfg (CfgNode):
-        """
-        super().__init__()
-        logger = logging.getLogger("detectron2")
-        if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
-            setup_logger()
-        # Rescale the config for the number of processes actually in use.
-        cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())
-        # Assume these objects must be constructed in this order.
-        model = self.build_model(cfg)
-        optimizer = self.build_optimizer(cfg, model)
-        data_loader = self.build_train_loader(cfg)
-        # The optimizer above was built from the unwrapped model.
-        model = create_ddp_model(model, broadcast_buffers=False)
-        # The inner trainer runs the actual steps; pick the AMP variant when enabled.
-        if cfg.SOLVER.AMP.ENABLED:
-            self._trainer = CustomAMPTrainer(model, data_loader, optimizer, cfg=cfg)
-        else:
-            self._trainer = CustomSimpleTrainer(model, data_loader, optimizer, cfg=cfg)
-        self.scheduler = self.build_lr_scheduler(cfg, optimizer)
-        self.checkpointer = DetectionCheckpointer(
-            # Assume you want to save checkpoints together with logs/statistics
-            model,
-            cfg.OUTPUT_DIR,
-            # weak proxy: the checkpointer does not hold a strong reference to the trainer
-            trainer=weakref.proxy(self),
-        )
-        self.start_iter = 0
-        self.max_iter = cfg.SOLVER.MAX_ITER
-        self.cfg = cfg
-        # build_hooks() reads self.cfg, so hooks are registered last.
-        self.register_hooks(self.build_hooks())
-    def resume_or_load(self, resume=True):
-        """
-        Load training state or initial weights through the checkpointer.
-        With `resume==True` and a last checkpoint present in `cfg.OUTPUT_DIR`
-        (defined by a `last_checkpoint` file), training resumes from that file:
-        all available states (eg. optimizer and scheduler) are loaded, the
-        iteration counter is taken from the checkpoint and ``cfg.MODEL.WEIGHTS``
-        is not used.
-        Otherwise this is an independent training: only the model weights from
-        `cfg.MODEL.WEIGHTS` are loaded and training starts at iteration 0.
-        Args:
-            resume (bool): whether to do resume or not
-        """
-        weights = self.cfg.MODEL.WEIGHTS
-        self.checkpointer.resume_or_load(weights, resume=resume)
-        if not resume:
-            return
-        if self.checkpointer.has_checkpoint():
-            # The checkpoint records the iteration that just finished,
-            # so training continues with the one after it.
-            self.start_iter = self.iter + 1
-    def build_hooks(self):
-        """
-        Assemble the default hooks: timing, lr scheduling, precise BN,
-        checkpointing, evaluation and event writing, in that order.
-        Returns:
-            list[HookBase]: the precise BN slot holds None when it is disabled
-            or the model has no BN modules.
-        """
-        cfg = self.cfg.clone()
-        cfg.defrost()
-        cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN
-        hook_list = [hooks.IterationTimer(), hooks.LRScheduler()]
-        if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model):
-            precise_bn = hooks.PreciseBN(
-                # Run at the same freq as (but before) evaluation.
-                cfg.TEST.EVAL_PERIOD,
-                self.model,
-                # Build a new data loader to not affect training
-                self.build_train_loader(cfg),
-                cfg.TEST.PRECISE_BN.NUM_ITER,
-            )
-        else:
-            precise_bn = None
-        hook_list.append(precise_bn)
-        # PreciseBN goes before the checkpointer because it updates the model and
-        # that update needs to be saved by the checkpointer.
-        # This is not always the best: if checkpointing has a different frequency,
-        # some checkpoints may have more precise statistics than others.
-        if comm.is_main_process():
-            checkpoint_hook = hooks.PeriodicCheckpointer(
-                self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD
-            )
-            hook_list.append(checkpoint_hook)
-        def test_and_save_results():
-            self._last_eval_results = self.test(self.cfg, self.model)
-            return self._last_eval_results
-        # Evaluation goes after the checkpointer, because then if it fails,
-        # the saved checkpoint can be used to debug.
-        hook_list.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
-        if comm.is_main_process():
-            # Here the default print/log frequency of each writer is used.
-            # Writers run last, so that evaluation metrics are written.
-            hook_list.append(hooks.PeriodicWriter(self.build_writers(), period=20))
-        return hook_list
-    def build_writers(self):
-        """
-        Create the event writers via :func:`default_writers()`, using the
-        config's output directory and this trainer's ``max_iter``.
-        Overwrite it in your trainer if you'd like a different list of writers.
-        Returns:
-            list[EventWriter]: a list of :class:`EventWriter` objects.
-        """
-        out_dir = self.cfg.OUTPUT_DIR
-        return default_writers(out_dir, self.max_iter)
-    def train(self):
-        """
-        Run training from ``start_iter`` to ``max_iter``.
-        Returns:
-            The last evaluation results, verified against
-            ``cfg.TEST.EXPECTED_RESULTS``, when expected results are configured
-            and this is the main process. Otherwise None.
-        """
-        super().train(self.start_iter, self.max_iter)
-        expects_results = len(self.cfg.TEST.EXPECTED_RESULTS)
-        if not (expects_results and comm.is_main_process()):
-            return None
-        assert hasattr(
-            self, "_last_eval_results"
-        ), "No evaluation results obtained during training!"
-        last_results = self._last_eval_results
-        verify_results(self.cfg, last_results)
-        return last_results
-    def run_step(self):
-        """Copy the current iteration onto the inner trainer and run one of its steps."""
-        inner = self._trainer
-        inner.iter = self.iter
-        inner.run_step()
-    def state_dict(self):
-        """Return the base trainer state plus the inner trainer's state under "_trainer"."""
-        state = super().state_dict()
-        inner_state = self._trainer.state_dict()
-        state["_trainer"] = inner_state
-        return state
-    def load_state_dict(self, state_dict):
-        """Restore the base trainer state, then the inner trainer from the "_trainer" entry."""
-        super().load_state_dict(state_dict)
-        inner_state = state_dict["_trainer"]
-        self._trainer.load_state_dict(inner_state)
-    @classmethod
-    def build_model(cls, cfg):
-        """
-        Build the model with the ``build_model`` imported from the local
-        ``modeling`` package and log its structure.
-        Overwrite it if you'd like a different model.
-        Returns:
-            torch.nn.Module:
-        """
-        built = build_model(cfg)
-        logging.getLogger(__name__).info("Model:\n{}".format(built))
-        return built
-    @classmethod
-    def build_optimizer(cls, cfg, model):
-        """
-        Build the optimizer with the ``build_optimizer`` imported from the
-        local ``solver`` package.
-        Overwrite it if you'd like a different optimizer.
-        Returns:
-            torch.optim.Optimizer:
-        """
-        optimizer = build_optimizer(cfg, model)
-        return optimizer
-    @classmethod
-    def build_lr_scheduler(cls, cfg, optimizer):
-        """
-        Build the LR scheduler with the ``build_lr_scheduler`` imported from
-        the local ``solver`` package.
-        Overwrite it if you'd like a different scheduler.
-        """
-        scheduler = build_lr_scheduler(cfg, optimizer)
-        return scheduler
-    @classmethod
-    def build_train_loader(cls, cfg):
-        """
-        Build the training data loader with the ``build_detection_train_loader``
-        imported from the local ``data`` package.
-        Overwrite it if you'd like a different data loader.
-        Returns:
-            iterable
-        """
-        train_loader = build_detection_train_loader(cfg)
-        return train_loader
-    @classmethod
-    def build_test_loader(cls, cfg, dataset_name):
-        """
-        Build the test data loader for ``dataset_name`` with the
-        ``build_detection_test_loader`` imported from the local ``data`` package.
-        Overwrite it if you'd like a different data loader.
-        Returns:
-            iterable
-        """
-        test_loader = build_detection_test_loader(cfg, dataset_name)
-        return test_loader
-    @classmethod
-    def build_evaluator(cls, cfg, dataset_name):
-        """
-        Hook for subclasses to provide an evaluator for ``dataset_name``.
-        Returns:
-            DatasetEvaluator or None
-        Raises:
-            NotImplementedError: always, in this default implementation.
-        """
-        message = """
-If you want DefaultTrainer to automatically run evaluation,
-please implement `build_evaluator()` in subclasses (see train_net.py for example).
-Alternatively, you can call evaluation functions yourself (see Colab balloon tutorial for example).
-"""
-        raise NotImplementedError(message)
-    @classmethod
-    def test(cls, cfg, model, evaluators=None):
-        """
-        Evaluate the given model. The given model is expected to already contain
-        weights to evaluate.
-        Args:
-            cfg (CfgNode):
-            model (nn.Module):
-            evaluators (list[DatasetEvaluator] or None): if None, will call
-                :meth:`build_evaluator`. Otherwise, must have the same length as
-                ``cfg.DATASETS.TEST``. A single DatasetEvaluator is also accepted
-                and treated as a one-element list.
-        Returns:
-            dict: a dict of result metrics, keyed by dataset name. When exactly
-            one dataset was processed, that dataset's result is returned directly.
-        """
-        logger = logging.getLogger(__name__)
-        if isinstance(evaluators, DatasetEvaluator):
-            evaluators = [evaluators]
-        if evaluators is not None:
-            assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
-                len(cfg.DATASETS.TEST), len(evaluators)
-            )
-        results = OrderedDict()
-        for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
-            data_loader = cls.build_test_loader(cfg, dataset_name)
-            # When evaluators are passed in as arguments,
-            # implicitly assume that evaluators can be created before data_loader.
-            if evaluators is not None:
-                evaluator = evaluators[idx]
-            else:
-                try:
-                    evaluator = cls.build_evaluator(cfg, dataset_name)
-                except NotImplementedError:
-                    # Logger.warn is a deprecated alias; use Logger.warning.
-                    logger.warning(
-                        "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
-                        "or implement its `build_evaluator` method."
-                    )
-                    # Record an empty result and move on to the next dataset.
-                    results[dataset_name] = {}
-                    continue
-            results_i = inference_on_dataset(model, data_loader, evaluator)
-            results[dataset_name] = results_i
-            if comm.is_main_process():
-                assert isinstance(
-                    results_i, dict
-                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
-                    results_i
-                )
-                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
-                print_csv_format(results_i)
-        # Unwrap the mapping when there is a single entry.
-        if len(results) == 1:
-            results = list(results.values())[0]
-        return results
-    @staticmethod
-    def auto_scale_workers(cfg, num_workers: int):
-        """
-        Adapt a config written for ``cfg.SOLVER.REFERENCE_WORLD_SIZE`` workers to
-        ``num_workers`` workers, keeping the per-GPU batch size equal to the
-        original ``IMS_PER_BATCH // REFERENCE_WORLD_SIZE``.
-        With ``scale = num_workers / REFERENCE_WORLD_SIZE``:
-        * total batch size and learning rate are multiplied by ``scale``
-          (following :paper:`ImageNet in 1h`);
-        * training steps, warmup steps, LR steps, eval period and checkpoint
-          period are divided by ``scale``.
-        For example, with the original config like the following:
-        .. code-block:: yaml
-            IMS_PER_BATCH: 16
-            BASE_LR: 0.1
-            REFERENCE_WORLD_SIZE: 8
-            MAX_ITER: 5000
-            STEPS: (4000,)
-            CHECKPOINT_PERIOD: 1000
-        When this config is used on 16 GPUs instead of the reference number 8,
-        calling this method will return a new config with:
-        .. code-block:: yaml
-            IMS_PER_BATCH: 32
-            BASE_LR: 0.2
-            REFERENCE_WORLD_SIZE: 16
-            MAX_ITER: 2500
-            STEPS: (2000,)
-            CHECKPOINT_PERIOD: 500
-        Note that both the original config and this new config can be trained on 16 GPUs.
-        It's up to user whether to enable this feature (by setting ``REFERENCE_WORLD_SIZE``).
-        Returns:
-            CfgNode: a new config. The original object is returned untouched if
-            ``cfg.SOLVER.REFERENCE_WORLD_SIZE`` is 0 or already equals ``num_workers``.
-        """
-        reference_size = cfg.SOLVER.REFERENCE_WORLD_SIZE
-        needs_scaling = reference_size != 0 and reference_size != num_workers
-        if not needs_scaling:
-            return cfg
-        # Work on a copy; remember the frozen state so it can be restored at the end.
-        cfg = cfg.clone()
-        frozen = cfg.is_frozen()
-        cfg.defrost()
-        solver = cfg.SOLVER
-        assert (
-            solver.IMS_PER_BATCH % reference_size == 0
-        ), "Invalid REFERENCE_WORLD_SIZE in config!"
-        scale = num_workers / reference_size
-        def shrink(value):
-            # Iteration-like quantities scale inversely with the number of workers.
-            return int(round(value / scale))
-        bs = int(round(solver.IMS_PER_BATCH * scale))
-        lr = solver.BASE_LR * scale
-        max_iter = shrink(solver.MAX_ITER)
-        warmup_iter = shrink(solver.WARMUP_ITERS)
-        solver.IMS_PER_BATCH = bs
-        solver.BASE_LR = lr
-        solver.MAX_ITER = max_iter
-        solver.WARMUP_ITERS = warmup_iter
-        solver.STEPS = tuple(shrink(s) for s in solver.STEPS)
-        cfg.TEST.EVAL_PERIOD = shrink(cfg.TEST.EVAL_PERIOD)
-        solver.CHECKPOINT_PERIOD = shrink(solver.CHECKPOINT_PERIOD)
-        solver.REFERENCE_WORLD_SIZE = num_workers  # maintain invariant
-        logging.getLogger(__name__).info(
-            f"Auto-scaling the config to batch_size={bs}, learning_rate={lr}, "
-            f"max_iter={max_iter}, warmup={warmup_iter}."
-        )
-        if frozen:
-            cfg.freeze()
-        return cfg
-# Forward a few basic attributes of DefaultTrainer to the trainer it wraps (`self._trainer`).
-for _attr in ["model", "data_loader", "optimizer"]:
-    # `x=_attr` freezes the attribute name per iteration; a plain closure would
-    # make every property refer to the last name in the list.
-    setattr(
-        DefaultTrainer,
-        _attr,
-        property(
-            fget=lambda self, x=_attr: getattr(self._trainer, x),
-            fset=lambda self, value, x=_attr: setattr(self._trainer, x, value),
-        ),
-    )

cutler/engine/train_loop.py DELETED Viewed

@@ -1,360 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/train_loop.py and https://github.com/NVlabs/FreeSOLO/tree/main/freesolo/engine/trainer.py
-import torch
-from torch.nn.parallel import DataParallel, DistributedDataParallel
-import numpy as np
-import time
-import torch
-from torch.nn.parallel import DataParallel, DistributedDataParallel
-import copy
-import random
-import torch.nn.functional as F
-from detectron2.structures.instances import Instances
-from detectron2.structures import BitMasks
-from detectron2.engine import SimpleTrainer
-__all__ = ["CustomSimpleTrainer", "CustomAMPTrainer"]
-class CustomSimpleTrainer(SimpleTrainer):
-    """
-    A simple trainer for the most common type of task:
-    single-cost single-optimizer single-data-source iterative optimization,
-    optionally using data-parallelism.
-    It assumes that every step, you:
-    1. Compute the loss with a data from the data_loader.
-    2. Compute the gradients with the above loss.
-    3. Update the model with the optimizer.
-    In addition, every batch can optionally be augmented with copy-paste
-    (see :meth:`copy_and_paste`) before it is passed to the model.
-    All other tasks during training (checkpointing, logging, evaluation, LR schedule)
-    are maintained by hooks, which can be registered by :meth:`TrainerBase.register_hooks`.
-    If you want to do anything fancier than this,
-    either subclass TrainerBase and implement your own `run_step`,
-    or write your own training loop.
-    """
-    def __init__(self, model, data_loader, optimizer, cfg=None, use_copy_paste=False,
-                copy_paste_rate=-1, copy_paste_random_num=None, copy_paste_min_ratio=-1,
-                copy_paste_max_ratio=-1, visualize_copy_paste=False):
-        """
-        Args:
-            model: a torch Module. Takes a data from data_loader and returns a
-                dict of losses.
-            data_loader: an iterable. Contains data to be used to call model.
-            optimizer: a torch optimizer.
-            cfg: optional config. When given, the ``cfg.DATALOADER.COPY_PASTE*``
-                values are used and the keyword arguments below are ignored.
-            use_copy_paste: whether :meth:`run_step` applies copy-paste augmentation.
-            copy_paste_rate: per-image threshold compared against ``random.random()``;
-                copy-paste is applied when the threshold is >= the drawn number.
-            copy_paste_random_num: if truthy, a random number of instances is copied
-                instead of all of them.
-            copy_paste_min_ratio, copy_paste_max_ratio: bounds of the random resize
-                ratio applied to the pasted content.
-            visualize_copy_paste: if truthy, the augmented samples are saved as images.
-        """
-        super().__init__(model, data_loader, optimizer)
-        self.cfg = cfg
-        # A supplied cfg takes precedence over the explicit keyword arguments.
-        self.use_copy_paste = use_copy_paste if self.cfg is None else self.cfg.DATALOADER.COPY_PASTE
-        self.cfg_COPY_PASTE_RATE = copy_paste_rate if self.cfg is None else self.cfg.DATALOADER.COPY_PASTE_RATE
-        self.cfg_COPY_PASTE_RANDOM_NUM = copy_paste_random_num if self.cfg is None else self.cfg.DATALOADER.COPY_PASTE_RANDOM_NUM
-        self.cfg_COPY_PASTE_MIN_RATIO = copy_paste_min_ratio if self.cfg is None else self.cfg.DATALOADER.COPY_PASTE_MIN_RATIO
-        self.cfg_COPY_PASTE_MAX_RATIO = copy_paste_max_ratio if self.cfg is None else self.cfg.DATALOADER.COPY_PASTE_MAX_RATIO
-        self.cfg_VISUALIZE_COPY_PASTE = visualize_copy_paste if self.cfg is None else self.cfg.DATALOADER.VISUALIZE_COPY_PASTE
-    def IoU(self, mask1, mask2): # only work when the batch size is 1
-        """Mean intersection-over-union of two masks thresholded at 0.5, shaped (1, -1)."""
-        mask1, mask2 = (mask1>0.5).to(torch.bool), (mask2>0.5).to(torch.bool)
-        # For boolean masks `mask1 * (mask1 == mask2)` is the logical AND of both masks.
-        intersection = torch.sum(mask1 * (mask1 == mask2), dim=[-1, -2]).squeeze()
-        # `+` on boolean tensors acts as logical OR.
-        union = torch.sum(mask1 + mask2, dim=[-1, -2]).squeeze()
-        return (intersection.to(torch.float) / union).mean().view(1, -1)
-    def IoY(self, mask1, mask2): # only work when the batch size is 1
-        """Like :meth:`IoU`, but the intersection is divided by the area of ``mask2`` only."""
-        mask1, mask2 = mask1.squeeze(), mask2.squeeze()
-        mask1, mask2 = (mask1>0.5).to(torch.bool), (mask2>0.5).to(torch.bool)
-        intersection = torch.sum(mask1 * (mask1 == mask2), dim=[-1, -2]).squeeze()
-        union = torch.sum(mask2, dim=[-1, -2]).squeeze()
-        return (intersection.to(torch.float) / union).mean().view(1, -1)
-    def copy_and_paste(self, labeled_data, unlabeled_data):
-        """
-        Copy-paste augmentation: paste instances of each ``labeled_data`` sample
-        onto the ``unlabeled_data`` sample at the same position in the batch.
-        Args:
-            labeled_data: list of dicts with "image" and "instances"; source of the
-                pasted objects.
-            unlabeled_data: list of dicts with "image" and "instances"; paste target.
-                Samples that receive objects are modified in place.
-        Returns:
-            list: the (possibly augmented) samples of ``unlabeled_data``.
-        """
-        new_unlabeled_data = []
-        def mask_iou_matrix(x, y, mode='iou'):
-            # Pairwise overlap between two stacks of masks: n1 x n2.
-            x = x.reshape(x.shape[0], -1).float()
-            y = y.reshape(y.shape[0], -1).float()
-            inter_matrix = x @ y.transpose(1, 0) # n1xn2
-            sum_x = x.sum(1)[:, None].expand(x.shape[0], y.shape[0])
-            sum_y = y.sum(1)[None, :].expand(x.shape[0], y.shape[0])
-            if mode == 'ioy':
-                # intersection over the area of `y` only
-                iou_matrix = inter_matrix / (sum_y)
-            else:
-                iou_matrix = inter_matrix / (sum_x + sum_y - inter_matrix)
-            return iou_matrix
-        def visualize_data(data, save_path = './sample.jpg'):
-            # Debug helper: draw the ground-truth instances on the image and save it.
-            from data import detection_utils as utils
-            from detectron2.data import DatasetCatalog, MetadataCatalog
-            from detectron2.utils.visualizer import Visualizer
-            data["instances"] = data["instances"].to(device='cpu')
-            img = data["image"].permute(1, 2, 0).cpu().detach().numpy()
-            img = utils.convert_image_to_rgb(img, 'RGB')
-            metadata = MetadataCatalog.get('imagenet_train_tau0.15')
-            visualizer = Visualizer(img, metadata=metadata, scale=1.0)
-            target_fields = data["instances"].get_fields()
-            labels = [metadata.thing_classes[i] for i in target_fields["gt_classes"]]
-            vis = visualizer.overlay_instances(
-                    labels=labels,
-                    boxes=target_fields.get("gt_boxes"), # ("gt_boxes", None),
-                    masks=target_fields.get("gt_masks"), # ("gt_masks", None),
-                    keypoints=target_fields.get("gt_keypoints", None),
-            )
-            print("Saving to {} ...".format(save_path))
-            vis.save(save_path)
-        for cur_labeled_data, cur_unlabeled_data in zip(labeled_data, unlabeled_data):
-            cur_labeled_instances = cur_labeled_data["instances"]
-            cur_labeled_image = cur_labeled_data["image"]
-            cur_unlabeled_instances = cur_unlabeled_data["instances"]
-            cur_unlabeled_image = cur_unlabeled_data["image"]
-            num_labeled_instances = len(cur_labeled_instances)
-            # Decide whether this image is augmented and how many instances are copied.
-            copy_paste_rate = random.random()
-            if self.cfg_COPY_PASTE_RATE >= copy_paste_rate and num_labeled_instances > 0:
-                if self.cfg_COPY_PASTE_RANDOM_NUM:
-                    num_copy = 1 if num_labeled_instances == 1 else np.random.randint(1, max(1, num_labeled_instances))
-                else:
-                    num_copy = num_labeled_instances
-            else:
-                num_copy = 0
-            if num_labeled_instances == 0 or num_copy == 0:
-                new_unlabeled_data.append(cur_unlabeled_data)
-            else:
-                choice = np.random.choice(num_labeled_instances, num_copy, replace=False)
-                copied_instances = cur_labeled_instances[choice].to(device=cur_unlabeled_instances.gt_boxes.device)
-                copied_masks = copied_instances.gt_masks
-                copied_boxes = copied_instances.gt_boxes
-                _, labeled_h, labeled_w = cur_labeled_image.shape
-                _, unlabeled_h, unlabeled_w = cur_unlabeled_image.shape
-                # Resize the labeled image to a random fraction (between the configured
-                # min and max ratio) of the unlabeled image and pick a random offset.
-                resize_ratio = random.uniform(self.cfg_COPY_PASTE_MIN_RATIO, self.cfg_COPY_PASTE_MAX_RATIO)
-                w_new = int(resize_ratio * unlabeled_w)
-                h_new = int(resize_ratio * unlabeled_h)
-                w_shift = random.randint(0, unlabeled_w - w_new)
-                h_shift = random.randint(0, unlabeled_h - h_new)
-                cur_labeled_image_new = F.interpolate(cur_labeled_image[None, ...].float(), size=(h_new, w_new), mode="bilinear", align_corners=False).byte().squeeze(0)
-                if isinstance(copied_masks, torch.Tensor):
-                    masks_new = F.interpolate(copied_masks[None, ...].float(), size=(h_new, w_new), mode="bilinear", align_corners=False).bool().squeeze(0)
-                else:
-                    masks_new = F.interpolate(copied_masks.tensor[None, ...].float(), size=(h_new, w_new), mode="bilinear", align_corners=False).bool().squeeze(0)
-                copied_boxes.scale(1. * unlabeled_w / labeled_w * resize_ratio, 1. * unlabeled_h / labeled_h * resize_ratio)
-                # Canvas for the pasted masks, sized like the target's mask tensors (N, H, W).
-                if isinstance(cur_unlabeled_instances.gt_masks, torch.Tensor):
-                    _, mask_h, mask_w = cur_unlabeled_instances.gt_masks.size()
-                else:
-                    _, mask_h, mask_w = cur_unlabeled_instances.gt_masks.tensor.size()
-                masks_new_all = torch.zeros(num_copy, mask_h, mask_w)
-                image_new_all = torch.zeros_like(cur_unlabeled_image)
-                image_new_all[:, h_shift:h_shift+h_new, w_shift:w_shift+w_new] += cur_labeled_image_new
-                masks_new_all[:, h_shift:h_shift+h_new, w_shift:w_shift+w_new] += masks_new
-                cur_labeled_image = image_new_all.byte() #.squeeze(0)
-                if isinstance(copied_masks, torch.Tensor):
-                    copied_masks = masks_new_all.bool() #.squeeze(0)
-                else:
-                    copied_masks.tensor = masks_new_all.bool() #.squeeze(0)
-                # Boxes are (x1, y1, x2, y2): x moves with the horizontal (width) offset,
-                # y with the vertical (height) offset, matching the image/mask placement above.
-                copied_boxes.tensor[:, 0] += w_shift
-                copied_boxes.tensor[:, 2] += w_shift
-                copied_boxes.tensor[:, 1] += h_shift
-                copied_boxes.tensor[:, 3] += h_shift
-                copied_instances.gt_masks = copied_masks
-                copied_instances.gt_boxes = copied_boxes
-                copied_instances._image_size = (unlabeled_h, unlabeled_w)
-                if len(cur_unlabeled_instances) == 0:
-                    # Nothing to merge with: the pasted instances become the annotations.
-                    if isinstance(copied_instances.gt_masks, torch.Tensor):
-                        alpha = copied_instances.gt_masks.sum(0) > 0
-                    else:
-                        alpha = copied_instances.gt_masks.tensor.sum(0) > 0
-                    # merge image
-                    alpha = alpha.cpu()
-                    composited_image = (alpha * cur_labeled_image) + (~alpha * cur_unlabeled_image)
-                    cur_unlabeled_data["image"] = composited_image
-                    cur_unlabeled_data["instances"] = copied_instances
-                else:
-                    # Drop a copied object when it covers at least half of an existing mask.
-                    if isinstance(copied_masks, torch.Tensor):
-                        iou_matrix = mask_iou_matrix(copied_masks, cur_unlabeled_instances.gt_masks, mode='ioy') # nxN
-                    else:
-                        iou_matrix = mask_iou_matrix(copied_masks.tensor, cur_unlabeled_instances.gt_masks.tensor, mode='ioy') # nxN
-                    keep = iou_matrix.max(1)[0] < 0.5
-                    if keep.sum() == 0:
-                        new_unlabeled_data.append(cur_unlabeled_data)
-                        continue
-                    copied_instances = copied_instances[keep]
-                    # Existing instances lose the pixels that are now covered by pasted objects.
-                    if isinstance(copied_instances.gt_masks, torch.Tensor):
-                        alpha = copied_instances.gt_masks.sum(0) > 0
-                        cur_unlabeled_instances.gt_masks = ~alpha * cur_unlabeled_instances.gt_masks
-                        areas_unlabeled = cur_unlabeled_instances.gt_masks.sum((1,2))
-                    else:
-                        alpha = copied_instances.gt_masks.tensor.sum(0) > 0
-                        cur_unlabeled_instances.gt_masks.tensor = ~alpha * cur_unlabeled_instances.gt_masks.tensor
-                        areas_unlabeled = cur_unlabeled_instances.gt_masks.tensor.sum((1,2))
-                    # merge image
-                    alpha = alpha.cpu()
-                    composited_image = (alpha * cur_labeled_image) + (~alpha * cur_unlabeled_image)
-                    # merge instances; fully occluded existing instances are removed
-                    merged_instances = Instances.cat([cur_unlabeled_instances[areas_unlabeled > 0], copied_instances])
-                    # boxes are recomputed from the final masks
-                    if isinstance(merged_instances.gt_masks, torch.Tensor):
-                        merged_instances.gt_boxes = BitMasks(merged_instances.gt_masks).get_bounding_boxes()
-                    else:
-                        merged_instances.gt_boxes = merged_instances.gt_masks.get_bounding_boxes()
-                    cur_unlabeled_data["image"] = composited_image
-                    cur_unlabeled_data["instances"] = merged_instances
-                if self.cfg_VISUALIZE_COPY_PASTE:
-                    visualize_data(cur_unlabeled_data, save_path = 'sample_{}.jpg'.format(np.random.randint(5)))
-                new_unlabeled_data.append(cur_unlabeled_data)
-        return new_unlabeled_data
-    def run_step(self):
-        """
-        Implement the standard training logic described above.
-        When the loss is NaN the iteration is skipped: no backward pass and no
-        optimizer update are performed.
-        """
-        assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
-        start = time.perf_counter()
-        """
-        If you want to do something with the data, you can wrap the dataloader.
-        """
-        data = next(self._data_loader_iter)
-        if self.use_copy_paste:
-            # The reversed copy of the batch provides the objects pasted onto the batch.
-            data = self.copy_and_paste(copy.deepcopy(data[::-1]), data)
-        data_time = time.perf_counter() - start
-        """
-        If you want to do something with the losses, you can wrap the model.
-        """
-        loss_dict = self.model(data)
-        if isinstance(loss_dict, torch.Tensor):
-            losses = loss_dict
-            loss_dict = {"total_loss": loss_dict}
-        else:
-            losses = sum(loss_dict.values())
-        if torch.isnan(losses):
-            # No gradients were computed for this batch; stepping the optimizer here
-            # would re-apply the gradients left over from the previous iteration.
-            print('Nan loss. Skipped.')
-            self._write_metrics(loss_dict, data_time)
-            return
-        """
-        If you need to accumulate gradients or do something similar, you can
-        wrap the optimizer with your custom `zero_grad()` method.
-        """
-        self.optimizer.zero_grad()
-        losses.backward()
-        self._write_metrics(loss_dict, data_time)
-        """
-        If you need gradient clipping/scaling or other processing, you can
-        wrap the optimizer with your custom `step()` method. But it is
-        suboptimal as explained in https://arxiv.org/abs/2006.15704 Sec 3.2.4
-        """
-        self.optimizer.step()
-class CustomAMPTrainer(CustomSimpleTrainer):
-    """
-    Like :class:`CustomSimpleTrainer`, but uses PyTorch's native automatic mixed precision
-    in the training loop.
-    """
-    def __init__(self, model, data_loader, optimizer, cfg=None, grad_scaler=None, use_copy_paste=False,
-                copy_paste_rate=-1, copy_paste_random_num=None, copy_paste_min_ratio=-1,
-                copy_paste_max_ratio=-1, visualize_copy_paste=False):
-        """
-        Args:
-            model, data_loader, optimizer: same as in :class:`SimpleTrainer`.
-            cfg, use_copy_paste, copy_paste_*, visualize_copy_paste: forwarded
-                unchanged to :class:`CustomSimpleTrainer`.
-            grad_scaler: torch GradScaler to automatically scale gradients.
-                A default ``GradScaler()`` is created when None.
-        """
-        unsupported = "AMPTrainer does not support single-process multi-device training!"
-        if isinstance(model, DistributedDataParallel):
-            assert not (model.device_ids and len(model.device_ids) > 1), unsupported
-        assert not isinstance(model, DataParallel), unsupported
-        super().__init__(
-            model, data_loader, optimizer, cfg=cfg, use_copy_paste=use_copy_paste,
-            copy_paste_rate=copy_paste_rate, copy_paste_random_num=copy_paste_random_num,
-            copy_paste_min_ratio=copy_paste_min_ratio, copy_paste_max_ratio=copy_paste_max_ratio,
-            visualize_copy_paste=visualize_copy_paste,
-        )
-        if grad_scaler is None:
-            from torch.cuda.amp import GradScaler
-            grad_scaler = GradScaler()
-        self.grad_scaler = grad_scaler
-    def run_step(self):
-        """
-        Implement the AMP training logic.
-        When the loss is NaN the iteration is skipped: no backward pass, no
-        optimizer update and no grad-scaler update are performed.
-        """
-        assert self.model.training, "[AMPTrainer] model was changed to eval mode!"
-        assert torch.cuda.is_available(), "[AMPTrainer] CUDA is required for AMP training!"
-        from torch.cuda.amp import autocast
-        start = time.perf_counter()
-        data = next(self._data_loader_iter)
-        if self.use_copy_paste:
-            # The reversed copy of the batch provides the objects pasted onto the batch.
-            data = self.copy_and_paste(copy.deepcopy(data[::-1]), data)
-        data_time = time.perf_counter() - start
-        with autocast():
-            loss_dict = self.model(data)
-            if isinstance(loss_dict, torch.Tensor):
-                losses = loss_dict
-                loss_dict = {"total_loss": loss_dict}
-            else:
-                losses = sum(loss_dict.values())
-        if torch.isnan(losses):
-            # No scaled backward pass happened for this batch, so the scaler must not
-            # step the optimizer: it would only see gradients from an earlier iteration.
-            print('Nan loss.')
-            self._write_metrics(loss_dict, data_time)
-            return
-        self.optimizer.zero_grad()
-        self.grad_scaler.scale(losses).backward()
-        self._write_metrics(loss_dict, data_time)
-        self.grad_scaler.step(self.optimizer)
-        self.grad_scaler.update()
-    def state_dict(self):
-        """Return the parent's state extended with the grad scaler state under "grad_scaler"."""
-        ret = super().state_dict()
-        ret["grad_scaler"] = self.grad_scaler.state_dict()
-        return ret
-    def load_state_dict(self, state_dict):
-        """Restore the parent's state and the grad scaler state saved by :meth:`state_dict`."""
-        super().load_state_dict(state_dict)
-        self.grad_scaler.load_state_dict(state_dict["grad_scaler"])

cutler/evaluation/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-from .coco_evaluation import COCOEvaluator

cutler/evaluation/coco_evaluation.py DELETED Viewed

@@ -1,727 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/coco_evaluation.py
-# supports evaluation of object detection only, although the prediction contains both segmentation and detection results.
-import contextlib
-import copy
-import io
-import itertools
-import json
-import logging
-import numpy as np
-import os
-import pickle
-from collections import OrderedDict
-import pycocotools.mask as mask_util
-import torch
-from pycocotools.coco import COCO
-from pycocotools.cocoeval import COCOeval
-from tabulate import tabulate
-import detectron2.utils.comm as comm
-from detectron2.config import CfgNode
-from detectron2.data import MetadataCatalog
-from detectron2.data.datasets.coco import convert_to_coco_json
-from detectron2.structures import Boxes, BoxMode, pairwise_iou
-from detectron2.utils.file_io import PathManager
-from detectron2.utils.logger import create_small_table
-from detectron2.evaluation.evaluator import DatasetEvaluator
-try:
-    from detectron2.evaluation.fast_eval_api import COCOeval_opt
-except ImportError:
-    COCOeval_opt = COCOeval
-class COCOEvaluator(DatasetEvaluator):
-    """
-    Evaluate AR for object proposals, AP for instance detection/segmentation, AP
-    for keypoint detection outputs using COCO's metrics.
-    See http://cocodataset.org/#detection-eval and
-    http://cocodataset.org/#keypoints-eval to understand its metrics.
-    The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
-    the metric cannot be computed (e.g. due to no predictions made).
-    In addition to COCO, this evaluator is able to support any bounding box detection,
-    instance segmentation, or keypoint detection dataset.
-    """
-    def __init__(
-        self,
-        dataset_name,
-        tasks=None,
-        distributed=True,
-        output_dir=None,
-        *,
-        max_dets_per_image=None,
-        use_fast_impl=True,
-        kpt_oks_sigmas=(),
-        allow_cached_coco=True,
-        no_segm=False,
-    ):
-        """
-        Args:
-            dataset_name (str): name of the dataset to be evaluated.
-                It must have either the following corresponding metadata:
-                    "json_file": the path to the COCO format annotation
-                Or it must be in detectron2's standard dataset format
-                so it can be converted to COCO format automatically.
-            tasks (tuple[str]): tasks that can be evaluated under the given
-                configuration. A task is one of "bbox", "segm", "keypoints".
-                By default, will infer this automatically from predictions.
-            distributed (True): if True, will collect results from all ranks and run evaluation
-                in the main process.
-                Otherwise, will only evaluate the results in the current process.
-            output_dir (str): optional, an output directory to dump all
-                results predicted on the dataset. The dump contains two files:
-                1. "instances_predictions.pth" a file that can be loaded with `torch.load` and
-                   contains all the results in the format they are produced by the model.
-                2. "coco_instances_results.json" a json file in COCO's result format.
-            max_dets_per_image (int): limit on the maximum number of detections per image.
-                By default in COCO, this limit is to 100, but this can be customized
-                to be greater, as is needed in evaluation metrics AP fixed and AP pool
-                (see https://arxiv.org/pdf/2102.01066.pdf)
-                This doesn't affect keypoint evaluation.
-            use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP.
-                Although the results should be very close to the official implementation in COCO
-                API, it is still recommended to compute results with the official API for use in
-                papers. The faster implementation also uses more RAM.
-            kpt_oks_sigmas (list[float]): The sigmas used to calculate keypoint OKS.
-                See http://cocodataset.org/#keypoints-eval
-                When empty, it will use the defaults in COCO.
-                Otherwise it should be the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
-            allow_cached_coco (bool): Whether to use cached coco json from previous validation
-                runs. You should set this to False if you need to use different validation data.
-                Defaults to True.
-            no_segm (bool): if True, the "segm" task is never added when the tasks are
-                inferred from the predictions, even if they contain segmentations.
-                Defaults to False.
-        Raises:
-            ValueError: if the dataset has no "json_file" metadata and ``output_dir`` is None.
-        """
-        self._logger = logging.getLogger(__name__)
-        self._distributed = distributed
-        self._output_dir = output_dir
-        self.no_segm = no_segm
-        # COCOeval_opt falls back to COCOeval at import time when the fast API is unavailable.
-        if use_fast_impl and (COCOeval_opt is COCOeval):
-            self._logger.info("Fast COCO eval is not built. Falling back to official COCO eval.")
-            use_fast_impl = False
-        self._use_fast_impl = use_fast_impl
-        # COCOeval requires the limit on the number of detections per image (maxDets) to be a list
-        # with at least 3 elements. The default maxDets in COCOeval is [1, 10, 100], in which the
-        # 3rd element (100) is used as the limit on the number of detections per image when
-        # evaluating AP. COCOEvaluator expects an integer for max_dets_per_image, so for COCOeval,
-        # we reformat max_dets_per_image into [1, 10, max_dets_per_image], based on the defaults.
-        if max_dets_per_image is None:
-            max_dets_per_image = [1, 10, 100]
-        else:
-            max_dets_per_image = [1, 10, max_dets_per_image]
-        self._max_dets_per_image = max_dets_per_image
-        if tasks is not None and isinstance(tasks, CfgNode):
-            # Legacy call style: a config was passed in place of `tasks`.
-            kpt_oks_sigmas = (
-                tasks.TEST.KEYPOINT_OKS_SIGMAS if not kpt_oks_sigmas else kpt_oks_sigmas
-            )
-            # `Logger.warn` is a deprecated alias of `Logger.warning`.
-            self._logger.warning(
-                "COCO Evaluator instantiated using config, this is deprecated behavior."
-                " Please pass in explicit arguments instead."
-            )
-            self._tasks = None  # Inferring it from predictions should be better
-        else:
-            self._tasks = tasks
-        self._cpu_device = torch.device("cpu")
-        self._metadata = MetadataCatalog.get(dataset_name)
-        if not hasattr(self._metadata, "json_file"):
-            # No COCO json registered: convert the dataset and cache it in output_dir.
-            if output_dir is None:
-                raise ValueError(
-                    "output_dir must be provided to COCOEvaluator "
-                    "for datasets not in COCO format."
-                )
-            self._logger.info(f"Trying to convert '{dataset_name}' to COCO format ...")
-            cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json")
-            self._metadata.json_file = cache_path
-            convert_to_coco_json(dataset_name, cache_path, allow_cached=allow_cached_coco)
-        json_file = PathManager.get_local_path(self._metadata.json_file)
-        # Silence whatever the COCO constructor prints to stdout.
-        with contextlib.redirect_stdout(io.StringIO()):
-            self._coco_api = COCO(json_file)
-        # Test set json files do not contain annotations (evaluation must be
-        # performed using the COCO evaluation server).
-        self._do_evaluation = "annotations" in self._coco_api.dataset
-        if self._do_evaluation:
-            self._kpt_oks_sigmas = kpt_oks_sigmas
-    def reset(self):
-        """Discard all collected predictions and start with an empty list."""
-        self._predictions = list()
-    def process(self, inputs, outputs):
-        """
-        Convert one batch of model outputs into prediction records and store them.
-        Args:
-            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
-                It is a list of dict. Each dict corresponds to an image and
-                contains keys like "height", "width", "file_name", "image_id".
-            outputs: the outputs of a COCO model. It is a list of dicts with key
-                "instances" that contains :class:`Instances`.
-        """
-        for sample, result in zip(inputs, outputs):
-            image_id = sample["image_id"]
-            record = {"image_id": image_id}
-            if "instances" in result:
-                cpu_instances = result["instances"].to(self._cpu_device)
-                record["instances"] = instances_to_coco_json(cpu_instances, image_id)
-            if "proposals" in result:
-                record["proposals"] = result["proposals"].to(self._cpu_device)
-            # A record holding nothing but the image id carries no prediction: drop it.
-            if len(record) > 1:
-                self._predictions.append(record)
-    def evaluate(self, img_ids=None):
-        """
-        Run the evaluation on everything collected by :meth:`process`.
-        Args:
-            img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset
-        Returns:
-            dict: a deep copy of the computed results; an empty dict on non-main
-            processes (distributed mode) or when there are no predictions.
-        """
-        predictions = self._predictions
-        if self._distributed:
-            # Collect the per-rank lists on rank 0 and flatten them into one list.
-            comm.synchronize()
-            gathered = comm.gather(self._predictions, dst=0)
-            predictions = list(itertools.chain.from_iterable(gathered))
-            if not comm.is_main_process():
-                return {}
-        if not predictions:
-            self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
-            return {}
-        if self._output_dir:
-            PathManager.mkdirs(self._output_dir)
-            dump_path = os.path.join(self._output_dir, "instances_predictions.pth")
-            with PathManager.open(dump_path, "wb") as handle:
-                torch.save(predictions, handle)
-        self._results = OrderedDict()
-        # The first record decides which kinds of evaluation are run.
-        first = predictions[0]
-        if "proposals" in first:
-            self._eval_box_proposals(predictions)
-        if "instances" in first:
-            self._eval_predictions(predictions, img_ids=img_ids)
-        # Copy so the caller can do whatever with results
-        return copy.deepcopy(self._results)
-    def _tasks_from_predictions(self, predictions):
-        """
-        Get COCO API "tasks" (i.e. iou_type) from COCO-format predictions.
-        """
-        tasks = {"bbox"}
-        for pred in predictions:
-            if "segmentation" in pred and not self.no_segm:
-                tasks.add("segm")
-            if "keypoints" in pred:
-                tasks.add("keypoints")
-        return sorted(tasks)
-    def _eval_predictions(self, predictions, img_ids=None):
-        """
-        Evaluate predictions. Fill self._results with the metrics of the tasks.
-        """
-        self._logger.info("Preparing results for COCO format ...")
-        coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
-        tasks = self._tasks or self._tasks_from_predictions(coco_results)
-        # unmap the category ids for COCO
-        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
-            dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
-            all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
-            num_classes = len(all_contiguous_ids)
-            assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1
-            reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
-            for result in coco_results:
-                category_id = result["category_id"]
-                assert category_id < num_classes, (
-                    f"A prediction has class={category_id}, "
-                    f"but the dataset only has {num_classes} classes and "
-                    f"predicted class id should be in [0, {num_classes - 1}]."
-                )
-                result["category_id"] = reverse_id_mapping[category_id]
-        if self._output_dir:
-            file_path = os.path.join(self._output_dir, "coco_instances_results.json")
-            self._logger.info("Saving results to {}".format(file_path))
-            with PathManager.open(file_path, "w") as f:
-                f.write(json.dumps(coco_results))
-                f.flush()
-        if not self._do_evaluation:
-            self._logger.info("Annotations are not available for evaluation.")
-            return
-        self._logger.info(
-            "Evaluating predictions with {} COCO API...".format(
-                "unofficial" if self._use_fast_impl else "official"
-            )
-        )
-        for task in sorted(tasks):
-            assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
-            coco_eval = (
-                _evaluate_predictions_on_coco(
-                    self._coco_api,
-                    coco_results,
-                    task,
-                    kpt_oks_sigmas=self._kpt_oks_sigmas,
-                    use_fast_impl=self._use_fast_impl,
-                    img_ids=img_ids,
-                    max_dets_per_image=self._max_dets_per_image,
-                )
-                if len(coco_results) > 0
-                else None  # cocoapi does not handle empty results very well
-            )
-            res = self._derive_coco_results(
-                coco_eval, task, class_names=self._metadata.get("thing_classes")
-            )
-            self._results[task] = res
-    def _eval_box_proposals(self, predictions):
-        """
-        Evaluate the box proposals in predictions.
-        Fill self._results with the metrics for "box_proposals" task.
-        """
-        if self._output_dir:
-            # Saving generated box proposals to file.
-            # Predicted box_proposals are in XYXY_ABS mode.
-            bbox_mode = BoxMode.XYXY_ABS.value
-            ids, boxes, objectness_logits = [], [], []
-            for prediction in predictions:
-                ids.append(prediction["image_id"])
-                boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
-                objectness_logits.append(prediction["proposals"].objectness_logits.numpy())
-            proposal_data = {
-                "boxes": boxes,
-                "objectness_logits": objectness_logits,
-                "ids": ids,
-                "bbox_mode": bbox_mode,
-            }
-            with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
-                pickle.dump(proposal_data, f)
-        if not self._do_evaluation:
-            self._logger.info("Annotations are not available for evaluation.")
-            return
-        self._logger.info("Evaluating bbox proposals ...")
-        res = {}
-        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
-        for limit in [100, 1000]:
-            for area, suffix in areas.items():
-                stats = _evaluate_box_proposals(predictions, self._coco_api, area=area, limit=limit)
-                key = "AR{}@{:d}".format(suffix, limit)
-                res[key] = float(stats["ar"].item() * 100)
-        self._logger.info("Proposal metrics: \n" + create_small_table(res))
-        self._results["box_proposals"] = res
-    def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
-        """
-        Derive the desired score numbers from summarized COCOeval.
-        Args:
-            coco_eval (None or COCOEval): None represents no predictions from model.
-            iou_type (str):
-            class_names (None or list[str]): if provided, will use it to predict
-                per-category AP.
-        Returns:
-            a dict of {metric name: score}
-        """
-        metrics = {
-            "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
-            "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
-            "keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
-        }[iou_type]
-        if coco_eval is None:
-            self._logger.warn("No predictions from the model!")
-            return {metric: float("nan") for metric in metrics}
-        # the standard metrics
-        results = {
-            metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
-            for idx, metric in enumerate(metrics)
-        }
-        self._logger.info(
-            "Evaluation results for {}: \n".format(iou_type) + create_small_table(results)
-        )
-        if not np.isfinite(sum(results.values())):
-            self._logger.info("Some metrics cannot be computed and is shown as NaN.")
-        if class_names is None or len(class_names) <= 1:
-            return results
-        # Compute per-category AP
-        # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
-        precisions = coco_eval.eval["precision"]
-        # precision has dims (iou, recall, cls, area range, max dets)
-        assert len(class_names) == precisions.shape[2]
-        results_per_category = []
-        for idx, name in enumerate(class_names):
-            # area range index 0: all area ranges
-            # max dets index -1: typically 100 per image
-            precision = precisions[:, :, idx, 0, -1]
-            precision = precision[precision > -1]
-            ap = np.mean(precision) if precision.size else float("nan")
-            results_per_category.append(("{}".format(name), float(ap * 100)))
-        # tabulate it
-        N_COLS = min(6, len(results_per_category) * 2)
-        results_flatten = list(itertools.chain(*results_per_category))
-        results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
-        table = tabulate(
-            results_2d,
-            tablefmt="pipe",
-            floatfmt=".3f",
-            headers=["category", "AP"] * (N_COLS // 2),
-            numalign="left",
-        )
-        self._logger.info("Per-category {} AP: \n".format(iou_type) + table)
-        results.update({"AP-" + name: ap for name, ap in results_per_category})
-        return results
-def instances_to_coco_json(instances, img_id):
-    """
-    Dump an "Instances" object to a COCO-format json that's used for evaluation.
-    Args:
-        instances (Instances):
-        img_id (int): the image id
-    Returns:
-        list[dict]: list of json annotations in COCO format.
-    """
-    num_instance = len(instances)
-    if num_instance == 0:
-        return []
-    boxes = instances.pred_boxes.tensor.numpy()
-    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
-    boxes = boxes.tolist()
-    scores = instances.scores.tolist()
-    classes = instances.pred_classes.tolist()
-    has_mask = instances.has("pred_masks")
-    if has_mask:
-        # use RLE to encode the masks, because they are too large and takes memory
-        # since this evaluator stores outputs of the entire dataset
-        rles = [
-            mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
-            for mask in instances.pred_masks
-        ]
-        for rle in rles:
-            # "counts" is an array encoded by mask_util as a byte-stream. Python3's
-            # json writer which always produces strings cannot serialize a bytestream
-            # unless you decode it. Thankfully, utf-8 works out (which is also what
-            # the pycocotools/_mask.pyx does).
-            rle["counts"] = rle["counts"].decode("utf-8")
-    has_keypoints = instances.has("pred_keypoints")
-    if has_keypoints:
-        keypoints = instances.pred_keypoints
-    results = []
-    for k in range(num_instance):
-        result = {
-            "image_id": img_id,
-            "category_id": classes[k],
-            "bbox": boxes[k],
-            "score": scores[k],
-        }
-        if has_mask:
-            result["segmentation"] = rles[k]
-        if has_keypoints:
-            # In COCO annotations,
-            # keypoints coordinates are pixel indices.
-            # However our predictions are floating point coordinates.
-            # Therefore we subtract 0.5 to be consistent with the annotation format.
-            # This is the inverse of data loading logic in `datasets/coco.py`.
-            keypoints[k][:, :2] -= 0.5
-            result["keypoints"] = keypoints[k].flatten().tolist()
-        results.append(result)
-    return results
-# inspired from Detectron:
-# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
-def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None):
-    """
-    Evaluate detection proposal recall metrics. This function is a much
-    faster alternative to the official COCO API recall evaluation code. However,
-    it produces slightly different results.
-    """
-    # Record max overlap value for each gt box
-    # Return vector of overlap values
-    areas = {
-        "all": 0,
-        "small": 1,
-        "medium": 2,
-        "large": 3,
-        "96-128": 4,
-        "128-256": 5,
-        "256-512": 6,
-        "512-inf": 7,
-    }
-    area_ranges = [
-        [0**2, 1e5**2],  # all
-        [0**2, 32**2],  # small
-        [32**2, 96**2],  # medium
-        [96**2, 1e5**2],  # large
-        [96**2, 128**2],  # 96-128
-        [128**2, 256**2],  # 128-256
-        [256**2, 512**2],  # 256-512
-        [512**2, 1e5**2],
-    ]  # 512-inf
-    assert area in areas, "Unknown area range: {}".format(area)
-    area_range = area_ranges[areas[area]]
-    gt_overlaps = []
-    num_pos = 0
-    for prediction_dict in dataset_predictions:
-        predictions = prediction_dict["proposals"]
-        # sort predictions in descending order
-        # TODO maybe remove this and make it explicit in the documentation
-        inds = predictions.objectness_logits.sort(descending=True)[1]
-        predictions = predictions[inds]
-        ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"])
-        anno = coco_api.loadAnns(ann_ids)
-        gt_boxes = [
-            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
-            for obj in anno
-            if obj["iscrowd"] == 0
-        ]
-        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
-        gt_boxes = Boxes(gt_boxes)
-        gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])
-        if len(gt_boxes) == 0 or len(predictions) == 0:
-            continue
-        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
-        gt_boxes = gt_boxes[valid_gt_inds]
-        num_pos += len(gt_boxes)
-        if len(gt_boxes) == 0:
-            continue
-        if limit is not None and len(predictions) > limit:
-            predictions = predictions[:limit]
-        overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)
-        _gt_overlaps = torch.zeros(len(gt_boxes))
-        for j in range(min(len(predictions), len(gt_boxes))):
-            # find which proposal box maximally covers each gt box
-            # and get the iou amount of coverage for each gt box
-            max_overlaps, argmax_overlaps = overlaps.max(dim=0)
-            # find which gt box is 'best' covered (i.e. 'best' = most iou)
-            gt_ovr, gt_ind = max_overlaps.max(dim=0)
-            assert gt_ovr >= 0
-            # find the proposal box that covers the best covered gt box
-            box_ind = argmax_overlaps[gt_ind]
-            # record the iou coverage of this gt box
-            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
-            assert _gt_overlaps[j] == gt_ovr
-            # mark the proposal box and the gt box as used
-            overlaps[box_ind, :] = -1
-            overlaps[:, gt_ind] = -1
-        # append recorded iou coverage level
-        gt_overlaps.append(_gt_overlaps)
-    gt_overlaps = (
-        torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
-    )
-    gt_overlaps, _ = torch.sort(gt_overlaps)
-    if thresholds is None:
-        step = 0.05
-        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
-    recalls = torch.zeros_like(thresholds)
-    # compute recall for each iou threshold
-    for i, t in enumerate(thresholds):
-        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
-    # ar = 2 * np.trapz(recalls, thresholds)
-    ar = recalls.mean()
-    return {
-        "ar": ar,
-        "recalls": recalls,
-        "thresholds": thresholds,
-        "gt_overlaps": gt_overlaps,
-        "num_pos": num_pos,
-    }
-def _evaluate_predictions_on_coco(
-    coco_gt,
-    coco_results,
-    iou_type,
-    kpt_oks_sigmas=None,
-    use_fast_impl=True,
-    img_ids=None,
-    max_dets_per_image=None,
-):
-    """
-    Evaluate the coco results using COCOEval API.
-    """
-    assert len(coco_results) > 0
-    if iou_type == "segm":
-        coco_results = copy.deepcopy(coco_results)
-        # When evaluating mask AP, if the results contain bbox, cocoapi will
-        # use the box area as the area of the instance, instead of the mask area.
-        # This leads to a different definition of small/medium/large.
-        # We remove the bbox field to let mask AP use mask area.
-        for c in coco_results:
-            c.pop("bbox", None)
-    coco_dt = coco_gt.loadRes(coco_results)
-    coco_eval = (COCOeval_opt if use_fast_impl else COCOeval)(coco_gt, coco_dt, iou_type)
-    # For COCO, the default max_dets_per_image is [1, 10, 100].
-    if max_dets_per_image is None:
-        max_dets_per_image = [1, 10, 100]  # Default from COCOEval
-    else:
-        assert (
-            len(max_dets_per_image) >= 3
-        ), "COCOeval requires maxDets (and max_dets_per_image) to have length at least 3"
-        # In the case that user supplies a custom input for max_dets_per_image,
-        # apply COCOevalMaxDets to evaluate AP with the custom input.
-        if max_dets_per_image[2] != 100:
-            coco_eval = COCOevalMaxDets(coco_gt, coco_dt, iou_type)
-    if iou_type != "keypoints":
-        coco_eval.params.maxDets = max_dets_per_image
-    if img_ids is not None:
-        coco_eval.params.imgIds = img_ids
-    if iou_type == "keypoints":
-        # Use the COCO default keypoint OKS sigmas unless overrides are specified
-        if kpt_oks_sigmas:
-            assert hasattr(coco_eval.params, "kpt_oks_sigmas"), "pycocotools is too old!"
-            coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas)
-        # COCOAPI requires every detection and every gt to have keypoints, so
-        # we just take the first entry from both
-        num_keypoints_dt = len(coco_results[0]["keypoints"]) // 3
-        num_keypoints_gt = len(next(iter(coco_gt.anns.values()))["keypoints"]) // 3
-        num_keypoints_oks = len(coco_eval.params.kpt_oks_sigmas)
-        assert num_keypoints_oks == num_keypoints_dt == num_keypoints_gt, (
-            f"[COCOEvaluator] Prediction contain {num_keypoints_dt} keypoints. "
-            f"Ground truth contains {num_keypoints_gt} keypoints. "
-            f"The length of cfg.TEST.KEYPOINT_OKS_SIGMAS is {num_keypoints_oks}. "
-            "They have to agree with each other. For meaning of OKS, please refer to "
-            "http://cocodataset.org/#keypoints-eval."
-        )
-    coco_eval.evaluate()
-    coco_eval.accumulate()
-    coco_eval.summarize()
-    return coco_eval
-class COCOevalMaxDets(COCOeval):
-    """
-    Modified version of COCOeval for evaluating AP with a custom
-    maxDets (by default for COCO, maxDets is 100)
-    """
-    def summarize(self):
-        """
-        Compute and display summary metrics for evaluation results given
-        a custom value for  max_dets_per_image
-        """
-        def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
-            p = self.params
-            iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
-            titleStr = "Average Precision" if ap == 1 else "Average Recall"
-            typeStr = "(AP)" if ap == 1 else "(AR)"
-            iouStr = (
-                "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
-                if iouThr is None
-                else "{:0.2f}".format(iouThr)
-            )
-            aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
-            mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
-            if ap == 1:
-                # dimension of precision: [TxRxKxAxM]
-                s = self.eval["precision"]
-                # IoU
-                if iouThr is not None:
-                    t = np.where(iouThr == p.iouThrs)[0]
-                    s = s[t]
-                s = s[:, :, :, aind, mind]
-            else:
-                # dimension of recall: [TxKxAxM]
-                s = self.eval["recall"]
-                if iouThr is not None:
-                    t = np.where(iouThr == p.iouThrs)[0]
-                    s = s[t]
-                s = s[:, :, aind, mind]
-            if len(s[s > -1]) == 0:
-                mean_s = -1
-            else:
-                mean_s = np.mean(s[s > -1])
-            print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
-            return mean_s
-        def _summarizeDets():
-            stats = np.zeros((12,))
-            # Evaluate AP using the custom limit on maximum detections per image
-            stats[0] = _summarize(1, maxDets=self.params.maxDets[2])
-            stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
-            stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
-            stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2])
-            stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2])
-            stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2])
-            stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
-            stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
-            stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
-            stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2])
-            stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2])
-            stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2])
-            return stats
-        def _summarizeKps():
-            stats = np.zeros((10,))
-            stats[0] = _summarize(1, maxDets=20)
-            stats[1] = _summarize(1, maxDets=20, iouThr=0.5)
-            stats[2] = _summarize(1, maxDets=20, iouThr=0.75)
-            stats[3] = _summarize(1, maxDets=20, areaRng="medium")
-            stats[4] = _summarize(1, maxDets=20, areaRng="large")
-            stats[5] = _summarize(0, maxDets=20)
-            stats[6] = _summarize(0, maxDets=20, iouThr=0.5)
-            stats[7] = _summarize(0, maxDets=20, iouThr=0.75)
-            stats[8] = _summarize(0, maxDets=20, areaRng="medium")
-            stats[9] = _summarize(0, maxDets=20, areaRng="large")
-            return stats
-        if not self.eval:
-            raise Exception("Please run accumulate() first")
-        iouType = self.params.iouType
-        if iouType == "segm" or iouType == "bbox":
-            summarize = _summarizeDets
-        elif iouType == "keypoints":
-            summarize = _summarizeKps
-        self.stats = summarize()
-    def __str__(self):
-        self.summarize()

cutler/model_zoo/configs/Base-RCNN-FPN.yaml DELETED Viewed

@@ -1,42 +0,0 @@
-MODEL:
-  META_ARCHITECTURE: "GeneralizedRCNN"
-  BACKBONE:
-    NAME: "build_resnet_fpn_backbone"
-  RESNETS:
-    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
-  FPN:
-    IN_FEATURES: ["res2", "res3", "res4", "res5"]
-  ANCHOR_GENERATOR:
-    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
-    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
-  RPN:
-    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
-    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
-    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
-    # Detectron1 uses 2000 proposals per-batch,
-    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
-    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
-    POST_NMS_TOPK_TRAIN: 1000
-    POST_NMS_TOPK_TEST: 1000
-  ROI_HEADS:
-    NAME: "StandardROIHeads"
-    IN_FEATURES: ["p2", "p3", "p4", "p5"]
-  ROI_BOX_HEAD:
-    NAME: "FastRCNNConvFCHead"
-    NUM_FC: 2
-    POOLER_RESOLUTION: 7
-  ROI_MASK_HEAD:
-    NAME: "MaskRCNNConvUpsampleHead"
-    NUM_CONV: 4
-    POOLER_RESOLUTION: 14
-DATASETS:
-  TRAIN: ("coco_2017_train",)
-  TEST: ("coco_2017_val",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.02
-  STEPS: (60000, 80000)
-  MAX_ITER: 90000
-INPUT:
-  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-VERSION: 2

cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_100perc.yaml DELETED Viewed

@@ -1,40 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: "http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_final.pth"
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-  RPN:
-    POST_NMS_TOPK_TRAIN: 2000
-DATASETS:
-  TRAIN: ("coco_2017_train",)
-  TEST: ("coco_2017_val",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.02
-  STEPS: (60000, 80000)
-  MAX_ITER: 90000
-  BASE_LR_MULTIPLIER: 2
-  BASE_LR_MULTIPLIER_NAMES: ['roi_heads.mask_head.predictor', 'roi_heads.box_predictor.0.cls_score', 'roi_heads.box_predictor.0.bbox_pred', 'roi_heads.box_predictor.1.cls_score', 'roi_heads.box_predictor.1.bbox_pred', 'roi_heads.box_predictor.2.cls_score', 'roi_heads.box_predictor.2.bbox_pred']
-INPUT:
-  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-  EVAL_PERIOD: 5000
-OUTPUT_DIR: "output/100perc"

cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_10perc.yaml DELETED Viewed

@@ -1,40 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: "http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_final.pth"
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-  RPN:
-    POST_NMS_TOPK_TRAIN: 2000
-DATASETS:
-  TRAIN: ("coco_semi_10perc",)
-  TEST: ("coco_2017_val",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.04
-  STEPS: (6000, 8000)
-  MAX_ITER: 9000
-  BASE_LR_MULTIPLIER: 4
-  BASE_LR_MULTIPLIER_NAMES: ['roi_heads.mask_head.predictor', 'roi_heads.box_predictor.0.cls_score', 'roi_heads.box_predictor.0.bbox_pred', 'roi_heads.box_predictor.1.cls_score', 'roi_heads.box_predictor.1.bbox_pred', 'roi_heads.box_predictor.2.cls_score', 'roi_heads.box_predictor.2.bbox_pred']
-INPUT:
-  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-  EVAL_PERIOD: 5000
-OUTPUT_DIR: "output/10perc"

cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_1perc.yaml DELETED Viewed

@@ -1,42 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: "http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_final.pth"
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-  RPN:
-    POST_NMS_TOPK_TRAIN: 2000
-DATASETS:
-  TRAIN: ("coco_semi_1perc",)
-  TEST: ("coco_2017_val",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.04
-  STEPS: (2400, 3200)
-  MAX_ITER: 3600
-  WARMUP_FACTOR: 0.001
-  WARMUP_ITERS: 1000
-  BASE_LR_MULTIPLIER: 4
-  BASE_LR_MULTIPLIER_NAMES: ['roi_heads.mask_head.predictor', 'roi_heads.box_predictor.0.cls_score', 'roi_heads.box_predictor.0.bbox_pred', 'roi_heads.box_predictor.1.cls_score', 'roi_heads.box_predictor.1.bbox_pred', 'roi_heads.box_predictor.2.cls_score', 'roi_heads.box_predictor.2.bbox_pred']
-INPUT:
-  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-  EVAL_PERIOD: 5000
-OUTPUT_DIR: "output/1perc"

cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_20perc.yaml DELETED Viewed

@@ -1,40 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: "http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_final.pth"
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-  RPN:
-    POST_NMS_TOPK_TRAIN: 2000
-DATASETS:
-  TRAIN: ("coco_semi_20perc",)
-  TEST: ("coco_2017_val",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.04
-  STEPS: (12000, 16000)
-  MAX_ITER: 18000
-  BASE_LR_MULTIPLIER: 4
-  BASE_LR_MULTIPLIER_NAMES: ['roi_heads.mask_head.predictor', 'roi_heads.box_predictor.0.cls_score', 'roi_heads.box_predictor.0.bbox_pred', 'roi_heads.box_predictor.1.cls_score', 'roi_heads.box_predictor.1.bbox_pred', 'roi_heads.box_predictor.2.cls_score', 'roi_heads.box_predictor.2.bbox_pred']
-INPUT:
-  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-  EVAL_PERIOD: 5000
-OUTPUT_DIR: "output/20perc"

cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_2perc.yaml DELETED Viewed

@@ -1,42 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: "http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_final.pth"
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-  RPN:
-    POST_NMS_TOPK_TRAIN: 2000
-DATASETS:
-  TRAIN: ("coco_semi_2perc",)
-  TEST: ("coco_2017_val",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.04
-  STEPS: (2400, 3200)
-  MAX_ITER: 3600
-  WARMUP_FACTOR: 0.001
-  WARMUP_ITERS: 1000
-  BASE_LR_MULTIPLIER: 4
-  BASE_LR_MULTIPLIER_NAMES: ['roi_heads.mask_head.predictor', 'roi_heads.box_predictor.0.cls_score', 'roi_heads.box_predictor.0.bbox_pred', 'roi_heads.box_predictor.1.cls_score', 'roi_heads.box_predictor.1.bbox_pred', 'roi_heads.box_predictor.2.cls_score', 'roi_heads.box_predictor.2.bbox_pred']
-INPUT:
-  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-  EVAL_PERIOD: 5000
-OUTPUT_DIR: "output/2perc"

cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_30perc.yaml DELETED Viewed

@@ -1,40 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: "http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_final.pth"
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-  RPN:
-    POST_NMS_TOPK_TRAIN: 2000
-DATASETS:
-  TRAIN: ("coco_semi_30perc",)
-  TEST: ("coco_2017_val",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.04
-  STEPS: (18000, 24000)
-  MAX_ITER: 27000
-  BASE_LR_MULTIPLIER: 4
-  BASE_LR_MULTIPLIER_NAMES: ['roi_heads.mask_head.predictor', 'roi_heads.box_predictor.0.cls_score', 'roi_heads.box_predictor.0.bbox_pred', 'roi_heads.box_predictor.1.cls_score', 'roi_heads.box_predictor.1.bbox_pred', 'roi_heads.box_predictor.2.cls_score', 'roi_heads.box_predictor.2.bbox_pred']
-INPUT:
-  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-  EVAL_PERIOD: 5000
-OUTPUT_DIR: "output/30perc"

cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_40perc.yaml DELETED Viewed

@@ -1,40 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: "http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_final.pth"
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-  RPN:
-    POST_NMS_TOPK_TRAIN: 2000
-DATASETS:
-  TRAIN: ("coco_semi_40perc",)
-  TEST: ("coco_2017_val",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.04
-  STEPS: (24000, 32000)
-  MAX_ITER: 36000
-  BASE_LR_MULTIPLIER: 4
-  BASE_LR_MULTIPLIER_NAMES: ['roi_heads.mask_head.predictor', 'roi_heads.box_predictor.0.cls_score', 'roi_heads.box_predictor.0.bbox_pred', 'roi_heads.box_predictor.1.cls_score', 'roi_heads.box_predictor.1.bbox_pred', 'roi_heads.box_predictor.2.cls_score', 'roi_heads.box_predictor.2.bbox_pred']
-INPUT:
-  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-  EVAL_PERIOD: 5000
-OUTPUT_DIR: "output/40perc"

cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_50perc.yaml DELETED Viewed

@@ -1,40 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: "http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_final.pth"
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-  RPN:
-    POST_NMS_TOPK_TRAIN: 2000
-DATASETS:
-  TRAIN: ("coco_semi_50perc",)
-  TEST: ("coco_2017_val",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.02
-  STEPS: (30000, 40000)
-  MAX_ITER: 45000
-  BASE_LR_MULTIPLIER: 2
-  BASE_LR_MULTIPLIER_NAMES: ['roi_heads.mask_head.predictor', 'roi_heads.box_predictor.0.cls_score', 'roi_heads.box_predictor.0.bbox_pred', 'roi_heads.box_predictor.1.cls_score', 'roi_heads.box_predictor.1.bbox_pred', 'roi_heads.box_predictor.2.cls_score', 'roi_heads.box_predictor.2.bbox_pred']
-INPUT:
-  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-  EVAL_PERIOD: 5000
-OUTPUT_DIR: "output/50perc"

cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_5perc.yaml DELETED Viewed

@@ -1,42 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: "http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_final.pth"
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-  RPN:
-    POST_NMS_TOPK_TRAIN: 2000
-DATASETS:
-  TRAIN: ("coco_semi_5perc",)
-  TEST: ("coco_2017_val",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.04
-  STEPS: (3000, 4000)
-  MAX_ITER: 4500
-  WARMUP_FACTOR: 0.001
-  WARMUP_ITERS: 1000
-  BASE_LR_MULTIPLIER: 4
-  BASE_LR_MULTIPLIER_NAMES: ['roi_heads.mask_head.predictor', 'roi_heads.box_predictor.0.cls_score', 'roi_heads.box_predictor.0.bbox_pred', 'roi_heads.box_predictor.1.cls_score', 'roi_heads.box_predictor.1.bbox_pred', 'roi_heads.box_predictor.2.cls_score', 'roi_heads.box_predictor.2.bbox_pred']
-INPUT:
-  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-  EVAL_PERIOD: 5000
-OUTPUT_DIR: "output/5perc"

cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_60perc.yaml DELETED Viewed

@@ -1,40 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: "http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_final.pth"
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-  RPN:
-    POST_NMS_TOPK_TRAIN: 2000
-DATASETS:
-  TRAIN: ("coco_semi_60perc",)
-  TEST: ("coco_2017_val",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.02
-  STEPS: (36000, 48000)
-  MAX_ITER: 54000
-  BASE_LR_MULTIPLIER: 2
-  BASE_LR_MULTIPLIER_NAMES: ['roi_heads.mask_head.predictor', 'roi_heads.box_predictor.0.cls_score', 'roi_heads.box_predictor.0.bbox_pred', 'roi_heads.box_predictor.1.cls_score', 'roi_heads.box_predictor.1.bbox_pred', 'roi_heads.box_predictor.2.cls_score', 'roi_heads.box_predictor.2.bbox_pred']
-INPUT:
-  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-  EVAL_PERIOD: 5000
-OUTPUT_DIR: "output/60perc"

cutler/model_zoo/configs/COCO-Semisupervised/cascade_mask_rcnn_R_50_FPN_80perc.yaml DELETED Viewed

@@ -1,40 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: "http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_final.pth"
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-  RPN:
-    POST_NMS_TOPK_TRAIN: 2000
-DATASETS:
-  TRAIN: ("coco_semi_80perc",)
-  TEST: ("coco_2017_val",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.02
-  STEPS: (48000, 64000)
-  MAX_ITER: 72000
-  BASE_LR_MULTIPLIER: 2
-  BASE_LR_MULTIPLIER_NAMES: ['roi_heads.mask_head.predictor', 'roi_heads.box_predictor.0.cls_score', 'roi_heads.box_predictor.0.bbox_pred', 'roi_heads.box_predictor.1.cls_score', 'roi_heads.box_predictor.1.bbox_pred', 'roi_heads.box_predictor.2.cls_score', 'roi_heads.box_predictor.2.bbox_pred']
-INPUT:
-  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-  EVAL_PERIOD: 5000
-OUTPUT_DIR: "output/80perc"

cutler/model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN.yaml DELETED Viewed

@@ -1,61 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-DATALOADER:
-  COPY_PASTE: True
-  COPY_PASTE_RATE: 1.0
-  VISUALIZE_COPY_PASTE: False
-  COPY_PASTE_RANDOM_NUM: True
-  COPY_PASTE_MIN_RATIO: 0.3
-  COPY_PASTE_MAX_RATIO: 1.0
-  NUM_WORKERS: 0
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: 'http://dl.fbaipublicfiles.com/cutler/checkpoints/dino_RN50_pretrain_d2_format.pkl'
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-    NUM_CLASSES: 1
-    SCORE_THRESH_TEST: 0.0
-    POSITIVE_FRACTION: 0.25
-    USE_DROPLOSS: True
-    DROPLOSS_IOU_THRESH: 0.01
-  RPN:
-    POST_NMS_TOPK_TRAIN: 4000
-    NMS_THRESH: 0.65
-DATASETS:
-  TRAIN: ("imagenet_train",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.01
-  WEIGHT_DECAY: 0.00005
-  STEPS: (80000,)
-  MAX_ITER: 160000
-  GAMMA: 0.02
-  CLIP_GRADIENTS:
-    CLIP_TYPE: norm
-    CLIP_VALUE: 1.0
-    ENABLED: true
-    NORM_TYPE: 2.0
-  AMP:
-    ENABLED: True
-INPUT:
-  MIN_SIZE_TRAIN: (240, 320, 480, 640, 672, 704, 736, 768, 800, 1024)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-    NUM_ITER: 200
-  DETECTIONS_PER_IMAGE: 100
-OUTPUT_DIR: "output/"

cutler/model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN_demo.yaml DELETED Viewed

@@ -1,62 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-DATALOADER:
-  COPY_PASTE: True
-  COPY_PASTE_RATE: 1.0
-  VISUALIZE_COPY_PASTE: False
-  COPY_PASTE_RANDOM_NUM: True
-  COPY_PASTE_MIN_RATIO: 0.3
-  COPY_PASTE_MAX_RATIO: 1.0
-  NUM_WORKERS: 0
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: 'http://dl.fbaipublicfiles.com/cutler/checkpoints/dino_RN50_pretrain_d2_format.pkl'
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-    NUM_CLASSES: 1
-    SCORE_THRESH_TEST: 0.0
-    POSITIVE_FRACTION: 0.25
-    USE_DROPLOSS: True
-    DROPLOSS_IOU_THRESH: 0.01
-  RPN:
-    POST_NMS_TOPK_TRAIN: 4000
-    NMS_THRESH: 0.65
-DATASETS:
-  TRAIN: ("imagenet_train",)
-  TEST: ("imagenet_train",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.01
-  WEIGHT_DECAY: 0.00005
-  STEPS: (80000,)
-  MAX_ITER: 160000
-  GAMMA: 0.02
-  CLIP_GRADIENTS:
-    CLIP_TYPE: norm
-    CLIP_VALUE: 1.0
-    ENABLED: true
-    NORM_TYPE: 2.0
-  AMP:
-    ENABLED: True
-INPUT:
-  MIN_SIZE_TRAIN: (240, 320, 480, 640, 672, 704, 736, 768, 800, 1024)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-    NUM_ITER: 200
-  DETECTIONS_PER_IMAGE: 100
-OUTPUT_DIR: "output/"

cutler/model_zoo/configs/CutLER-ImageNet/cascade_mask_rcnn_R_50_FPN_self_train.yaml DELETED Viewed

@@ -1,60 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-DATALOADER:
-  COPY_PASTE: True
-  COPY_PASTE_RATE: 1.0
-  VISUALIZE_COPY_PASTE: False
-  COPY_PASTE_RANDOM_NUM: True
-  COPY_PASTE_MIN_RATIO: 0.5
-  COPY_PASTE_MAX_RATIO: 1.0
-  NUM_WORKERS: 2
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: 'http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_r1.pth' # round 1
-  # WEIGHTS: 'http://dl.fbaipublicfiles.com/cutler/checkpoints/cutler_cascade_r2.pth' # round 2
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_BOX_HEAD:
-    CLS_AGNOSTIC_BBOX_REG: True
-  ROI_HEADS:
-    NAME: CustomCascadeROIHeads
-    NUM_CLASSES: 1
-    SCORE_THRESH_TEST: 0.0
-    POSITIVE_FRACTION: 0.25
-    USE_DROPLOSS: False
-    DROPLOSS_IOU_THRESH: 0.01
-DATASETS:
-  TRAIN: ("imagenet_train_r1",) # round 1
-  # TRAIN: ("imagenet_train_r2",) # round 2
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.005
-  STEPS: (79999,)
-  MAX_ITER: 80000
-  GAMMA: 1.0
-  CLIP_GRADIENTS:
-    CLIP_TYPE: norm
-    CLIP_VALUE: 1.0
-    ENABLED: true
-    NORM_TYPE: 2.0
-  AMP:
-    ENABLED: True
-INPUT:
-  MIN_SIZE_TRAIN: (240, 320, 480, 640, 672, 704, 736, 768, 800, 1024)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-    NUM_ITER: 200
-  DETECTIONS_PER_IMAGE: 100
-OUTPUT_DIR: "output/self-train-r1/" # round 1
-# OUTPUT_DIR: "output/self-train-r2/" # round 2

cutler/model_zoo/configs/CutLER-ImageNet/mask_rcnn_R_50_FPN.yaml DELETED Viewed

@@ -1,52 +0,0 @@
-_BASE_: "../Base-RCNN-FPN.yaml"
-DATALOADER:
-  COPY_PASTE: True
-  COPY_PASTE_RATE: 1.0
-  VISUALIZE_COPY_PASTE: False
-  COPY_PASTE_RANDOM_NUM: True
-  COPY_PASTE_MIN_RATIO: 0.3
-  COPY_PASTE_MAX_RATIO: 1.0
-MODEL:
-  PIXEL_MEAN: [123.675, 116.280, 103.530]
-  PIXEL_STD: [58.395, 57.120, 57.375]
-  WEIGHTS: 'http://dl.fbaipublicfiles.com/cutler/checkpoints/dino_RN50_pretrain_d2_format.pkl'
-  MASK_ON: True
-  BACKBONE:
-    FREEZE_AT: 0
-  RESNETS:
-    DEPTH: 50
-    NORM: "SyncBN"
-    STRIDE_IN_1X1: False
-  FPN:
-    NORM: "SyncBN"
-  ROI_HEADS:
-    NAME: "CustomStandardROIHeads"
-    NUM_CLASSES: 1
-    SCORE_THRESH_TEST: 0.0
-    USE_DROPLOSS: True
-    DROPLOSS_IOU_THRESH: 0.01
-  RPN:
-    POST_NMS_TOPK_TRAIN: 4000
-    NMS_THRESH: 0.65
-DATASETS:
-  TRAIN: ("imagenet_train",)
-SOLVER:
-  IMS_PER_BATCH: 16
-  BASE_LR: 0.01
-  WEIGHT_DECAY: 0.00005
-  STEPS: (80000,)
-  MAX_ITER: 160000
-  CLIP_GRADIENTS:
-    CLIP_TYPE: norm
-    CLIP_VALUE: 1.0
-    ENABLED: true
-    NORM_TYPE: 2.0
-INPUT:
-  MIN_SIZE_TRAIN: (240, 320, 480, 640, 672, 704, 736, 768, 800, 1024)
-  MAX_SIZE_TRAIN: 1333
-  MASK_FORMAT: "bitmask"
-  FORMAT: "RGB"
-TEST:
-  PRECISE_BN:
-    ENABLED: True
-OUTPUT_DIR: "output/"

cutler/modeling/__init__.py DELETED Viewed

@@ -1,16 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-from .roi_heads import (
-    ROI_HEADS_REGISTRY,
-    ROIHeads,
-    CustomStandardROIHeads,
-    FastRCNNOutputLayers,
-    build_roi_heads,
-)
-from .roi_heads.custom_cascade_rcnn import CustomCascadeROIHeads
-from .roi_heads.fast_rcnn import FastRCNNOutputLayers
-from .meta_arch.rcnn import GeneralizedRCNN, ProposalNetwork
-from .meta_arch.build import build_model
-_EXCLUDE = {"ShapeSpec"}
-__all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")]

cutler/modeling/meta_arch/__init__.py DELETED Viewed

@@ -1,7 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/meta_arch/__init__.py
-from .build import META_ARCH_REGISTRY, build_model  # isort:skip
-__all__ = list(globals().keys())

cutler/modeling/meta_arch/build.py DELETED Viewed

@@ -1,27 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/meta_arch/build.py
-import torch
-from detectron2.utils.logger import _log_api_usage
-from detectron2.utils.registry import Registry
-META_ARCH_REGISTRY = Registry("META_ARCH")  # noqa F401 isort:skip
-META_ARCH_REGISTRY.__doc__ = """
-Registry for meta-architectures, i.e. the whole model.
-The registered object will be called with `obj(cfg)`
-and expected to return a `nn.Module` object.
-"""
-def build_model(cfg):
-    """
-    Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
-    Note that it does not load any weights from ``cfg``.
-    """
-    meta_arch = cfg.MODEL.META_ARCHITECTURE
-    model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
-    model.to(torch.device(cfg.MODEL.DEVICE))
-    _log_api_usage("modeling.meta_arch." + meta_arch)
-    return model

cutler/modeling/meta_arch/rcnn.py DELETED Viewed

@@ -1,344 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/meta_arch/rcnn.py
-import logging
-import numpy as np
-from typing import Dict, List, Optional, Tuple
-import torch
-from torch import nn
-from detectron2.config import configurable
-from detectron2.data.detection_utils import convert_image_to_rgb
-from detectron2.layers import move_device_like
-from detectron2.structures import ImageList, Instances
-from detectron2.utils.events import get_event_storage
-from detectron2.utils.logger import log_first_n
-from detectron2.modeling.backbone import Backbone, build_backbone
-from detectron2.modeling.postprocessing import detector_postprocess
-from detectron2.modeling.proposal_generator import build_proposal_generator
-from ..roi_heads import build_roi_heads
-from .build import META_ARCH_REGISTRY
-__all__ = ["GeneralizedRCNN", "ProposalNetwork"]
-@META_ARCH_REGISTRY.register()
-class GeneralizedRCNN(nn.Module):
-    """
-    Generalized R-CNN. Any models that contains the following three components:
-    1. Per-image feature extraction (aka backbone)
-    2. Region proposal generation
-    3. Per-region feature extraction and prediction
-    """
-    @configurable
-    def __init__(
-        self,
-        *,
-        backbone: Backbone,
-        proposal_generator: nn.Module,
-        roi_heads: nn.Module,
-        pixel_mean: Tuple[float],
-        pixel_std: Tuple[float],
-        input_format: Optional[str] = None,
-        vis_period: int = 0,
-    ):
-        """
-        Args:
-            backbone: a backbone module, must follow detectron2's backbone interface
-            proposal_generator: a module that generates proposals using backbone features
-            roi_heads: a ROI head that performs per-region computation
-            pixel_mean, pixel_std: list or tuple with #channels element, representing
-                the per-channel mean and std to be used to normalize the input image
-            input_format: describe the meaning of channels of input. Needed by visualization
-            vis_period: the period to run visualization. Set to 0 to disable.
-        """
-        super().__init__()
-        self.backbone = backbone
-        self.proposal_generator = proposal_generator
-        self.roi_heads = roi_heads
-        self.input_format = input_format
-        self.vis_period = vis_period
-        if vis_period > 0:
-            assert input_format is not None, "input_format is required for visualization!"
-        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
-        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
-        assert (
-            self.pixel_mean.shape == self.pixel_std.shape
-        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
-    @classmethod
-    def from_config(cls, cfg):
-        backbone = build_backbone(cfg)
-        return {
-            "backbone": backbone,
-            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
-            "roi_heads": build_roi_heads(cfg, backbone.output_shape()),
-            "input_format": cfg.INPUT.FORMAT,
-            "vis_period": cfg.VIS_PERIOD,
-            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
-            "pixel_std": cfg.MODEL.PIXEL_STD,
-        }
-    @property
-    def device(self):
-        return self.pixel_mean.device
-    def _move_to_current_device(self, x):
-        return move_device_like(x, self.pixel_mean)
-    def visualize_training(self, batched_inputs, proposals):
-        """
-        A function used to visualize images and proposals. It shows ground truth
-        bounding boxes on the original image and up to 20 top-scoring predicted
-        object proposals on the original image. Users can implement different
-        visualization functions for different models.
-        Args:
-            batched_inputs (list): a list that contains input to the model.
-            proposals (list): a list that contains predicted proposals. Both
-                batched_inputs and proposals should have the same length.
-        """
-        from detectron2.utils.visualizer import Visualizer
-        storage = get_event_storage()
-        max_vis_prop = 20
-        for input, prop in zip(batched_inputs, proposals):
-            img = input["image"]
-            img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
-            v_gt = Visualizer(img, None)
-            v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
-            anno_img = v_gt.get_image()
-            box_size = min(len(prop.proposal_boxes), max_vis_prop)
-            v_pred = Visualizer(img, None)
-            v_pred = v_pred.overlay_instances(
-                boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
-            )
-            prop_img = v_pred.get_image()
-            vis_img = np.concatenate((anno_img, prop_img), axis=1)
-            vis_img = vis_img.transpose(2, 0, 1)
-            vis_name = "Left: GT bounding boxes;  Right: Predicted proposals"
-            storage.put_image(vis_name, vis_img)
-            break  # only visualize one image in a batch
-    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
-        """
-        Args:
-            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
-                Each item in the list contains the inputs for one image.
-                For now, each item in the list is a dict that contains:
-                * image: Tensor, image in (C, H, W) format.
-                * instances (optional): groundtruth :class:`Instances`
-                * proposals (optional): :class:`Instances`, precomputed proposals.
-                Other information that's included in the original dicts, such as:
-                * "height", "width" (int): the output resolution of the model, used in inference.
-                  See :meth:`postprocess` for details.
-        Returns:
-            list[dict]:
-                Each dict is the output for one input image.
-                The dict contains one key "instances" whose value is a :class:`Instances`.
-                The :class:`Instances` object has the following keys:
-                "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
-        """
-        if not self.training:
-            return self.inference(batched_inputs)
-        images = self.preprocess_image(batched_inputs)
-        if "instances" in batched_inputs[0]:
-            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
-        else:
-            gt_instances = None
-        features = self.backbone(images.tensor)
-        if self.proposal_generator is not None:
-            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
-        else:
-            assert "proposals" in batched_inputs[0]
-            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
-            proposal_losses = {}
-        _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
-        if self.vis_period > 0:
-            storage = get_event_storage()
-            if storage.iter % self.vis_period == 0:
-                self.visualize_training(batched_inputs, proposals)
-        losses = {}
-        losses.update(detector_losses)
-        losses.update(proposal_losses)
-        return losses
-    def inference(
-        self,
-        batched_inputs: List[Dict[str, torch.Tensor]],
-        detected_instances: Optional[List[Instances]] = None,
-        do_postprocess: bool = True,
-    ):
-        """
-        Run inference on the given inputs.
-        Args:
-            batched_inputs (list[dict]): same as in :meth:`forward`
-            detected_instances (None or list[Instances]): if not None, it
-                contains an `Instances` object per image. The `Instances`
-                object contains "pred_boxes" and "pred_classes" which are
-                known boxes in the image.
-                The inference will then skip the detection of bounding boxes,
-                and only predict other per-ROI outputs.
-            do_postprocess (bool): whether to apply post-processing on the outputs.
-        Returns:
-            When do_postprocess=True, same as in :meth:`forward`.
-            Otherwise, a list[Instances] containing raw network outputs.
-        """
-        assert not self.training
-        images = self.preprocess_image(batched_inputs)
-        features = self.backbone(images.tensor)
-        if detected_instances is None:
-            if self.proposal_generator is not None:
-                proposals, _ = self.proposal_generator(images, features, None)
-            else:
-                assert "proposals" in batched_inputs[0]
-                proposals = [x["proposals"].to(self.device) for x in batched_inputs]
-            results, _ = self.roi_heads(images, features, proposals, None)
-        else:
-            detected_instances = [x.to(self.device) for x in detected_instances]
-            results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
-        if do_postprocess:
-            assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
-            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
-        else:
-            return results
-    def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]]):
-        """
-        Normalize, pad and batch the input images.
-        """
-        images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
-        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
-        images = ImageList.from_tensors(
-            images,
-            self.backbone.size_divisibility,
-            padding_constraints=self.backbone.padding_constraints,
-        )
-        return images
-    @staticmethod
-    def _postprocess(instances, batched_inputs: List[Dict[str, torch.Tensor]], image_sizes):
-        """
-        Rescale the output instances to the target size.
-        """
-        # note: private function; subject to changes
-        processed_results = []
-        for results_per_image, input_per_image, image_size in zip(
-            instances, batched_inputs, image_sizes
-        ):
-            height = input_per_image.get("height", image_size[0])
-            width = input_per_image.get("width", image_size[1])
-            r = detector_postprocess(results_per_image, height, width)
-            processed_results.append({"instances": r})
-        return processed_results
-@META_ARCH_REGISTRY.register()
-class ProposalNetwork(nn.Module):
-    """
-    A meta architecture that only predicts object proposals.
-    """
-    @configurable
-    def __init__(
-        self,
-        *,
-        backbone: Backbone,
-        proposal_generator: nn.Module,
-        pixel_mean: Tuple[float],
-        pixel_std: Tuple[float],
-    ):
-        """
-        Args:
-            backbone: a backbone module, must follow detectron2's backbone interface
-            proposal_generator: a module that generates proposals using backbone features
-            pixel_mean, pixel_std: list or tuple with #channels element, representing
-                the per-channel mean and std to be used to normalize the input image
-        """
-        super().__init__()
-        self.backbone = backbone
-        self.proposal_generator = proposal_generator
-        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
-        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
-    @classmethod
-    def from_config(cls, cfg):
-        backbone = build_backbone(cfg)
-        return {
-            "backbone": backbone,
-            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
-            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
-            "pixel_std": cfg.MODEL.PIXEL_STD,
-        }
-    @property
-    def device(self):
-        return self.pixel_mean.device
-    def _move_to_current_device(self, x):
-        return move_device_like(x, self.pixel_mean)
-    def forward(self, batched_inputs):
-        """
-        Args:
-            Same as in :class:`GeneralizedRCNN.forward`
-        Returns:
-            list[dict]:
-                Each dict is the output for one input image.
-                The dict contains one key "proposals" whose value is a
-                :class:`Instances` with keys "proposal_boxes" and "objectness_logits".
-        """
-        images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
-        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
-        images = ImageList.from_tensors(
-            images,
-            self.backbone.size_divisibility,
-            padding_constraints=self.backbone.padding_constraints,
-        )
-        features = self.backbone(images.tensor)
-        if "instances" in batched_inputs[0]:
-            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
-        elif "targets" in batched_inputs[0]:
-            log_first_n(
-                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
-            )
-            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
-        else:
-            gt_instances = None
-        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
-        # In training, the proposals are not useful at all but we generate them anyway.
-        # This makes RPN-only models about 5% slower.
-        if self.training:
-            return proposal_losses
-        processed_results = []
-        for results_per_image, input_per_image, image_size in zip(
-            proposals, batched_inputs, images.image_sizes
-        ):
-            height = input_per_image.get("height", image_size[0])
-            width = input_per_image.get("width", image_size[1])
-            r = detector_postprocess(results_per_image, height, width)
-            processed_results.append({"proposals": r})
-        return processed_results

cutler/modeling/roi_heads/__init__.py DELETED Viewed

@@ -1,16 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-from .roi_heads import (
-    ROI_HEADS_REGISTRY,
-    ROIHeads,
-    Res5ROIHeads,
-    CustomStandardROIHeads,
-    build_roi_heads,
-    select_foreground_proposals,
-)
-from .custom_cascade_rcnn import CustomCascadeROIHeads
-from .fast_rcnn import FastRCNNOutputLayers
-from . import custom_cascade_rcnn  # isort:skip
-__all__ = list(globals().keys())

cutler/modeling/roi_heads/custom_cascade_rcnn.py DELETED Viewed

@@ -1,338 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/roi_heads/cascade_rcnn.py
-from typing import List
-import torch
-from torch import nn
-from torch.autograd.function import Function
-from detectron2.config import configurable
-from detectron2.layers import ShapeSpec
-from detectron2.structures import Boxes, pairwise_iou
-from structures import pairwise_iou_max_scores
-from detectron2.structures import Instances
-from detectron2.utils.events import get_event_storage
-from detectron2.modeling.box_regression import Box2BoxTransform
-from detectron2.modeling.matcher import Matcher
-from detectron2.modeling.poolers import ROIPooler
-from detectron2.modeling.roi_heads.box_head import build_box_head
-from .fast_rcnn import FastRCNNOutputLayers, fast_rcnn_inference
-from .roi_heads import ROI_HEADS_REGISTRY, CustomStandardROIHeads
-import torch.nn.functional as F
-class _ScaleGradient(Function):
-    """
-    Autograd function that is the identity in the forward pass and multiplies the
-    incoming gradient by a constant factor in the backward pass.
-    """
-    @staticmethod
-    def forward(ctx, input, scale):
-        # Stash the factor on the context; the tensor itself passes through untouched.
-        ctx.scale = scale
-        return input
-    @staticmethod
-    def backward(ctx, grad_output):
-        # One gradient slot per forward argument: the scaled gradient for `input`
-        # and None for `scale`, which receives no gradient.
-        scaled_grad = grad_output * ctx.scale
-        return scaled_grad, None
-@ROI_HEADS_REGISTRY.register()
-class CustomCascadeROIHeads(CustomStandardROIHeads):
-    """
-    The ROI heads that implement :paper:`Cascade R-CNN`.
-    Boxes are refined stage by stage: stage 0 consumes the incoming proposals and
-    every later stage consumes the boxes predicted by the stage before it. When
-    ``self.use_droploss`` is true, each stage's classification loss is re-weighted
-    per proposal (see :meth:`_droploss_weights`).
-    NOTE(review): ``use_droploss`` and ``droploss_iou_thresh`` are read here but not
-    assigned in this class; presumably ``CustomStandardROIHeads`` sets them -- confirm.
-    """
-    @configurable
-    def __init__(
-        self,
-        *,
-        box_in_features: List[str],
-        box_pooler: ROIPooler,
-        box_heads: List[nn.Module],
-        box_predictors: List[nn.Module],
-        proposal_matchers: List[Matcher],
-        **kwargs,
-    ):
-        """
-        NOTE: this interface is experimental.
-        Args:
-            box_in_features (list[str]): names of the feature maps fed to the box pooler
-            box_pooler (ROIPooler): pooler that extracts region features from given boxes
-            box_heads (list[nn.Module]): box head for each cascade stage
-            box_predictors (list[nn.Module]): box predictor for each cascade stage
-            proposal_matchers (list[Matcher]): matcher with different IoU thresholds to
-                match boxes with ground truth for each stage. The first matcher matches
-                RPN proposals with ground truth, the other matchers use boxes predicted
-                by the previous stage as proposals and match them with ground truth.
-            **kwargs: forwarded to the base class; must not contain "proposal_matcher".
-        """
-        assert "proposal_matcher" not in kwargs, (
-            "CustomCascadeROIHeads takes 'proposal_matchers=' for each stage instead "
-            "of one 'proposal_matcher='."
-        )
-        # The first matcher matches RPN proposals with ground truth, done in the base class
-        kwargs["proposal_matcher"] = proposal_matchers[0]
-        num_stages = self.num_cascade_stages = len(box_heads)
-        box_heads = nn.ModuleList(box_heads)
-        box_predictors = nn.ModuleList(box_predictors)
-        assert len(box_predictors) == num_stages, f"{len(box_predictors)} != {num_stages}!"
-        assert len(proposal_matchers) == num_stages, f"{len(proposal_matchers)} != {num_stages}!"
-        # The base class stores the per-stage ModuleLists as `box_head` / `box_predictor`;
-        # the rest of this class indexes them by stage.
-        super().__init__(
-            box_in_features=box_in_features,
-            box_pooler=box_pooler,
-            box_head=box_heads,
-            box_predictor=box_predictors,
-            **kwargs,
-        )
-        self.proposal_matchers = proposal_matchers
-    @classmethod
-    def from_config(cls, cfg, input_shape):
-        """Build the constructor arguments from `cfg`, dropping the single-stage matcher."""
-        ret = super().from_config(cfg, input_shape)
-        # __init__ asserts that "proposal_matcher" is absent: this class takes one
-        # matcher per stage via "proposal_matchers" instead.
-        ret.pop("proposal_matcher")
-        return ret
-    @classmethod
-    def _init_box_head(cls, cfg, input_shape):
-        """
-        Build the shared box pooler plus one box head, box predictor and matcher per
-        cascade stage.
-        Returns:
-            dict: the keyword arguments "box_in_features", "box_pooler", "box_heads",
-                "box_predictors" and "proposal_matchers" expected by ``__init__``.
-        """
-        # fmt: off
-        in_features              = cfg.MODEL.ROI_HEADS.IN_FEATURES
-        pooler_resolution        = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
-        pooler_scales            = tuple(1.0 / input_shape[k].stride for k in in_features)
-        sampling_ratio           = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
-        pooler_type              = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
-        cascade_bbox_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS
-        cascade_ious             = cfg.MODEL.ROI_BOX_CASCADE_HEAD.IOUS
-        assert len(cascade_bbox_reg_weights) == len(cascade_ious)
-        assert cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,  \
-            "CustomCascadeROIHeads only support class-agnostic regression now!"
-        assert cascade_ious[0] == cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS[0]
-        # fmt: on
-        in_channels = [input_shape[f].channels for f in in_features]
-        # Check all channel counts are equal
-        assert len(set(in_channels)) == 1, in_channels
-        in_channels = in_channels[0]
-        box_pooler = ROIPooler(
-            output_size=pooler_resolution,
-            scales=pooler_scales,
-            sampling_ratio=sampling_ratio,
-            pooler_type=pooler_type,
-        )
-        pooled_shape = ShapeSpec(
-            channels=in_channels, width=pooler_resolution, height=pooler_resolution
-        )
-        box_heads, box_predictors, proposal_matchers = [], [], []
-        # One (head, predictor, matcher) triple per stage, each with its own IoU
-        # threshold and box-regression weights.
-        for match_iou, bbox_reg_weights in zip(cascade_ious, cascade_bbox_reg_weights):
-            box_head = build_box_head(cfg, pooled_shape)
-            box_heads.append(box_head)
-            box_predictors.append(
-                FastRCNNOutputLayers(
-                    cfg,
-                    box_head.output_shape,
-                    box2box_transform=Box2BoxTransform(weights=bbox_reg_weights),
-                )
-            )
-            proposal_matchers.append(Matcher([match_iou], [0, 1], allow_low_quality_matches=False))
-        return {
-            "box_in_features": in_features,
-            "box_pooler": box_pooler,
-            "box_heads": box_heads,
-            "box_predictors": box_predictors,
-            "proposal_matchers": proposal_matchers,
-        }
-    def forward(self, images, features, proposals, targets=None):
-        """
-        Run the cascade box heads, followed by the mask/keypoint branches.
-        Args:
-            images: unused; deleted immediately.
-            features: feature maps indexed by the names in ``self.box_in_features``.
-            proposals (list[Instances]): per-image proposals.
-            targets (list[Instances], optional): ground truth, used only in training.
-        Returns:
-            In training: ``(proposals, losses)`` with the sampled proposals and a dict of
-            losses. Otherwise: ``(pred_instances, {})``.
-        """
-        del images
-        if self.training:
-            proposals = self.label_and_sample_proposals(proposals, targets)
-            # Need targets to box head
-            losses = self._forward_box(features, proposals, targets)
-            losses.update(self._forward_mask(features, proposals))
-            losses.update(self._forward_keypoint(features, proposals))
-            return proposals, losses
-        pred_instances = self._forward_box(features, proposals)
-        pred_instances = self.forward_with_given_boxes(features, pred_instances)
-        return pred_instances, {}
-    def _forward_box(self, features, proposals, targets=None):
-        """
-        Run every cascade stage and either compute the losses or the final detections.
-        Args:
-            features, targets: the same as in :meth:`forward`.
-            proposals (list[Instances]): the per-image object proposals with
-                their matching ground truth.
-                Each has fields "proposal_boxes", and "objectness_logits",
-                "gt_classes", "gt_boxes".
-        Returns:
-            In training: dict mapping "<loss name>_stage<k>" to the loss of stage k.
-            Otherwise: list[Instances] produced by `fast_rcnn_inference` from the
-            stage-averaged scores and the boxes of the last stage.
-        """
-        features = [features[f] for f in self.box_in_features]
-        head_outputs = []  # (predictor, predictions, proposals)
-        prev_pred_boxes = None
-        image_sizes = [x.image_size for x in proposals]
-        for k in range(self.num_cascade_stages):
-            if k > 0:
-                # The output boxes of the previous stage are used to create the input
-                # proposals of the next stage.
-                proposals = self._create_proposals_from_boxes(prev_pred_boxes, image_sizes)
-                if self.training:
-                    proposals = self._match_and_label_boxes(proposals, k, targets)
-            predictions = self._run_stage(features, proposals, k)
-            prev_pred_boxes = self.box_predictor[k].predict_boxes(predictions, proposals)
-            head_outputs.append((self.box_predictor[k], predictions, proposals))
-        if self.training:
-            losses = {}
-            storage = get_event_storage()
-            for stage, (predictor, predictions, proposals) in enumerate(head_outputs):
-                with storage.name_scope("stage{}".format(stage)):
-                    weights = None
-                    if self.use_droploss:
-                        weights = self._droploss_weights(predictor, predictions, proposals)
-                    if weights is None:
-                        # Drop-loss disabled, or no ground-truth boxes were available.
-                        stage_losses = predictor.losses(predictions, proposals)
-                    else:
-                        stage_losses = predictor.losses(predictions, proposals, weights=weights)
-                losses.update({k + "_stage{}".format(stage): v for k, v in stage_losses.items()})
-            return losses
-        else:
-            # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1)
-            scores_per_stage = [h[0].predict_probs(h[1], h[2]) for h in head_outputs]
-            # Average the scores across heads
-            scores = [
-                sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages)
-                for scores_per_image in zip(*scores_per_stage)
-            ]
-            # Use the boxes of the last head
-            predictor, predictions, proposals = head_outputs[-1]
-            boxes = predictor.predict_boxes(predictions, proposals)
-            pred_instances, _ = fast_rcnn_inference(
-                boxes,
-                scores,
-                image_sizes,
-                predictor.test_score_thresh,
-                predictor.test_nms_thresh,
-                predictor.test_topk_per_image,
-            )
-            return pred_instances
-    def _droploss_weights(self, predictor, predictions, proposals):
-        """
-        Compute the per-proposal weights applied to one stage's classification loss.
-        A proposal gets weight 0 when the maximum IoU between its predicted box and the
-        image's ground-truth boxes is <= ``self.droploss_iou_thresh``, and 1 otherwise.
-        Args:
-            predictor: the stage's box predictor; its ``box2box_transform`` decodes deltas.
-            predictions: ``(scores, proposal_deltas)`` returned by the predictor.
-            proposals (list[Instances]): the stage's labeled proposals.
-        Returns:
-            Tensor or None: detached weights concatenated over all images, or None when
-            the ground-truth boxes of the proposals cannot be read, in which case the
-            caller uses the unweighted loss.
-        """
-        try:
-            box_num_list = [len(x.gt_boxes) for x in proposals]
-            # NOTE(review): de-duplicates only the first 100 matched gt boxes and later
-            # takes that many leading rows of `gt_boxes`; presumably this approximates
-            # the set of distinct ground-truth boxes -- confirm.
-            gt_num_list = [torch.unique(x.gt_boxes.tensor[:100], dim=0).size()[0] for x in proposals]
-        except Exception:
-            # Best effort: proposals without usable "gt_boxes" fall back to the plain loss.
-            # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
-            return None
-        # NOTE: decode the predicted boxes from the regression deltas
-        predictions_delta = predictions[1]
-        proposal_boxes = Boxes.cat([x.proposal_boxes for x in proposals])
-        predictions_bbox = predictor.box2box_transform.apply_deltas(predictions_delta, proposal_boxes.tensor)
-        # NOTE: maximum overlapping with GT (IoU), computed image by image
-        idx_start = 0
-        iou_max_list = []
-        for x, box_num, gt_num in zip(proposals, box_num_list, gt_num_list):
-            idx_end = idx_start + box_num
-            iou_max_list.append(pairwise_iou_max_scores(predictions_bbox[idx_start:idx_end], x.gt_boxes[:gt_num].tensor))
-            idx_start = idx_end
-        iou_max = torch.cat(iou_max_list, dim=0)
-        # NOTE: get the weight of each proposal (0 where IoU <= threshold, else 1)
-        weights = 1 - iou_max.le(self.droploss_iou_thresh).float()
-        return weights.detach()
-    @torch.no_grad()
-    def _match_and_label_boxes(self, proposals, stage, targets):
-        """
-        Match proposals with groundtruth using the matcher at the given stage.
-        Label the proposals as foreground or background based on the match.
-        Args:
-            proposals (list[Instances]): One Instances for each image, with
-                the field "proposal_boxes".
-            stage (int): the current stage
-            targets (list[Instances]): the ground truth instances
-        Returns:
-            list[Instances]: the same proposals, but with fields "gt_classes" and "gt_boxes"
-        """
-        num_fg_samples, num_bg_samples = [], []
-        for proposals_per_image, targets_per_image in zip(proposals, targets):
-            match_quality_matrix = pairwise_iou(
-                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
-            )
-            # proposal_labels are 0 or 1
-            matched_idxs, proposal_labels = self.proposal_matchers[stage](match_quality_matrix)
-            if len(targets_per_image) > 0:
-                gt_classes = targets_per_image.gt_classes[matched_idxs]
-                # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
-                gt_classes[proposal_labels == 0] = self.num_classes
-                gt_boxes = targets_per_image.gt_boxes[matched_idxs]
-            else:
-                # No ground truth in this image: everything is background and the
-                # gt boxes are all-zero placeholders.
-                gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
-                gt_boxes = Boxes(
-                    targets_per_image.gt_boxes.tensor.new_zeros((len(proposals_per_image), 4))
-                )
-            proposals_per_image.gt_classes = gt_classes
-            proposals_per_image.gt_boxes = gt_boxes
-            num_fg_samples.append((proposal_labels == 1).sum().item())
-            num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1])
-        # Log the number of fg/bg samples in each stage
-        storage = get_event_storage()
-        storage.put_scalar(
-            "stage{}/roi_head/num_fg_samples".format(stage),
-            sum(num_fg_samples) / len(num_fg_samples),
-        )
-        storage.put_scalar(
-            "stage{}/roi_head/num_bg_samples".format(stage),
-            sum(num_bg_samples) / len(num_bg_samples),
-        )
-        return proposals
-    def _run_stage(self, features, proposals, stage):
-        """
-        Args:
-            features (list[Tensor]): #lvl input features to ROIHeads
-            proposals (list[Instances]): #image Instances, with the field "proposal_boxes"
-            stage (int): the current stage
-        Returns:
-            Same output as `FastRCNNOutputLayers.forward()`.
-        """
-        box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
-        # The original implementation averages the losses among heads,
-        # but scale up the parameter gradients of the heads.
-        # This is equivalent to adding the losses among heads,
-        # but scale down the gradients on features.
-        if self.training:
-            box_features = _ScaleGradient.apply(box_features, 1.0 / self.num_cascade_stages)
-        box_features = self.box_head[stage](box_features)
-        return self.box_predictor[stage](box_features)
-    def _create_proposals_from_boxes(self, boxes, image_sizes):
-        """
-        Args:
-            boxes (list[Tensor]): per-image predicted boxes, each of shape Ri x 4
-            image_sizes (list[tuple]): list of image shapes in (h, w)
-        Returns:
-            list[Instances]: per-image proposals with the given boxes.
-        """
-        # Just like RPN, the proposals should not have gradients
-        boxes = [Boxes(b.detach()) for b in boxes]
-        proposals = []
-        for boxes_per_image, image_size in zip(boxes, image_sizes):
-            boxes_per_image.clip(image_size)
-            if self.training:
-                # do not filter empty boxes at inference time,
-                # because the scores from each stage need to be aligned and added later
-                boxes_per_image = boxes_per_image[boxes_per_image.nonempty()]
-            prop = Instances(image_size)
-            prop.proposal_boxes = boxes_per_image
-            proposals.append(prop)
-        return proposals

cutler/modeling/roi_heads/fast_rcnn.py DELETED Viewed

@@ -1,587 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/roi_heads/fast_rcnn.py
-import logging
-from typing import Callable, Dict, List, Optional, Tuple, Union
-import torch
-from torch import nn
-from torch.nn import functional as F
-from detectron2.config import configurable
-from detectron2.data.detection_utils import get_fed_loss_cls_weights
-from detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple
-from detectron2.modeling.box_regression import Box2BoxTransform, _dense_box_regression_loss
-from detectron2.structures import Instances, Boxes
-from detectron2.utils.events import get_event_storage
-from torch.nn import Parameter
-import torch.nn.functional as F
-__all__ = ["fast_rcnn_inference", "FastRCNNOutputLayers"]
-logger = logging.getLogger(__name__)
-"""
-Shape shorthand in this module:
-    N: number of images in the minibatch
-    R: number of ROIs, combined over all images, in the minibatch
-    Ri: number of ROIs in image i
-    K: number of foreground classes. E.g.,there are 80 foreground classes in COCO.
-Naming convention:
-    deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
-    transform (see :class:`box_regression.Box2BoxTransform`).
-    pred_class_logits: predicted class scores in [-inf, +inf]; use
-        softmax(pred_class_logits) to estimate P(class).
-    gt_classes: ground-truth classification labels in [0, K], where [0, K) represent
-        foreground object classes and K represents the background class.
-    pred_proposal_deltas: predicted box2box transform deltas for transforming proposals
-        to detection box predictions.
-    gt_proposal_deltas: ground-truth box2box transform deltas
-"""
-def fast_rcnn_inference(
-    boxes: List[torch.Tensor],
-    scores: List[torch.Tensor],
-    image_shapes: List[Tuple[int, int]],
-    score_thresh: float,
-    nms_thresh: float,
-    topk_per_image: int,
-):
-    """
-    Run `fast_rcnn_inference_single_image` on every image of the batch.
-    Args:
-        boxes (list[Tensor]): per-image predicted boxes, class-specific or
-            class-agnostic. Element i has shape (Ri, K * 4) for class-specific
-            regression, or (Ri, 4) for class-agnostic regression, where Ri is the
-            number of predicted objects for image i.
-            This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`.
-        scores (list[Tensor]): per-image predicted class scores. Element i has shape
-            (Ri, K + 1). Compatible with the output of
-            :meth:`FastRCNNOutputLayers.predict_probs`.
-        image_shapes (list[tuple]): one shape per image, forwarded to `Boxes.clip` and
-            `Instances`. The cascade head in this repository passes
-            `Instances.image_size` values here, i.e. (height, width).
-        score_thresh (float): only detections scoring above this threshold are returned.
-        nms_thresh (float): threshold for box non-maximum suppression. Value in [0, 1].
-        topk_per_image (int): number of top scoring detections to return. Set < 0 to
-            return all detections.
-    Returns:
-        instances: (list[Instances]): N instances, one per image, holding the topk
-            most confident detections.
-        kept_indices: (list[Tensor]): N 1D tensors; each element is the index in
-            [0, Ri) of the input box/score row that produced the detection for image i.
-    """
-    instances, kept_indices = [], []
-    for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes):
-        result, kept = fast_rcnn_inference_single_image(
-            boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image
-        )
-        instances.append(result)
-        kept_indices.append(kept)
-    return instances, kept_indices
-def _log_classification_stats(pred_logits, gt_classes, prefix="fast_rcnn"):
-    """
-    Write classification accuracy metrics to the current EventStorage.
-    Logs "<prefix>/cls_accuracy" and, when at least one foreground label exists,
-    "<prefix>/fg_cls_accuracy" and "<prefix>/false_negative". Does nothing when
-    `gt_classes` is empty.
-    Args:
-        pred_logits: Rx(K+1) logits. The last column is for background class.
-        gt_classes: R labels
-        prefix: name scope prepended to every logged metric.
-    """
-    total = gt_classes.numel()
-    if total == 0:
-        return
-    predicted = pred_logits.argmax(dim=1)
-    background = pred_logits.shape[1] - 1
-    # Foreground labels are those in [0, background).
-    is_fg = (gt_classes >= 0) & (gt_classes < background)
-    fg_total = is_fg.nonzero().numel()
-    fg_labels = gt_classes[is_fg]
-    fg_predicted = predicted[is_fg]
-    # Foreground proposals that were predicted as background.
-    fg_missed = (fg_predicted == background).nonzero().numel()
-    correct = (predicted == gt_classes).nonzero().numel()
-    fg_correct = (fg_predicted == fg_labels).nonzero().numel()
-    storage = get_event_storage()
-    storage.put_scalar(f"{prefix}/cls_accuracy", correct / total)
-    if fg_total <= 0:
-        return
-    storage.put_scalar(f"{prefix}/fg_cls_accuracy", fg_correct / fg_total)
-    storage.put_scalar(f"{prefix}/false_negative", fg_missed / fg_total)
-def fast_rcnn_inference_single_image(
-    boxes,
-    scores,
-    image_shape: Tuple[int, int],
-    score_thresh: float,
-    nms_thresh: float,
-    topk_per_image: int,
-):
-    """
-    Inference for one image: threshold the scores, then apply per-class
-    non-maximum suppression (NMS) to obtain the final box detections.
-    Args:
-        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
-        per image.
-    Returns:
-        Same as `fast_rcnn_inference`, but for only one image.
-    """
-    # Discard rows that contain a non-finite value in either the boxes or the scores.
-    finite_rows = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
-    if not finite_rows.all():
-        boxes, scores = boxes[finite_rows], scores[finite_rows]
-    # Drop the last score column before thresholding.
-    scores = scores[:, :-1]
-    num_bbox_reg_classes = boxes.shape[1] // 4
-    # Go through `Boxes` only to reuse its `clip` function.
-    clipped = Boxes(boxes.reshape(-1, 4))
-    clipped.clip(image_shape)
-    boxes = clipped.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4
-    # 1. Keep only (prediction, class) pairs scoring above the threshold; this also
-    #    makes NMS cheaper.
-    filter_mask = scores > score_thresh  # R x K
-    # R' x 2: column 0 indexes the R predictions, column 1 indexes the classes.
-    filter_inds = filter_mask.nonzero()
-    row_inds, class_inds = filter_inds[:, 0], filter_inds[:, 1]
-    boxes = boxes[row_inds, 0] if num_bbox_reg_classes == 1 else boxes[filter_mask]
-    scores = scores[filter_mask]
-    # 2. NMS, applied to each class independently.
-    keep = batched_nms(boxes, scores, class_inds, nms_thresh)
-    if topk_per_image >= 0:
-        keep = keep[:topk_per_image]
-    result = Instances(image_shape)
-    result.pred_boxes = Boxes(boxes[keep])
-    result.scores = scores[keep]
-    result.pred_classes = class_inds[keep]
-    return result, row_inds[keep]
-class NormedLinear(nn.Module):
-    """
-    Bias-free linear layer operating on L2-normalized inputs and weights: each output
-    is the product of a row-normalized input and a column-normalized weight column.
-    """
-    def __init__(self, in_features, out_features):
-        super(NormedLinear, self).__init__()
-        self.weight = Parameter(torch.Tensor(in_features, out_features))
-        # Uniform init in [-1, 1], then renorm along dim 1 with a tiny max-norm and
-        # scale back up by 1e5.
-        init = self.weight.data
-        init.uniform_(-1, 1)
-        init.renorm_(2, 1, 1e-5)
-        init.mul_(1e5)
-    def forward(self, x):
-        unit_rows = F.normalize(x, dim=1)
-        unit_cols = F.normalize(self.weight, dim=0)
-        return unit_rows.mm(unit_cols)
-class FastRCNNOutputLayers(nn.Module):
-    """
-    Two linear layers for predicting Fast R-CNN outputs:
-    1. proposal-to-detection box regression deltas
-    2. classification scores
-    """
-    @configurable
-    def __init__(
-        self,
-        input_shape: ShapeSpec,
-        *,
-        box2box_transform,
-        num_classes: int,
-        test_score_thresh: float = 0.0,
-        test_nms_thresh: float = 0.5,
-        test_topk_per_image: int = 100,
-        cls_agnostic_bbox_reg: bool = False,
-        smooth_l1_beta: float = 0.0,
-        box_reg_loss_type: str = "smooth_l1",
-        loss_weight: Union[float, Dict[str, float]] = 1.0,
-        use_fed_loss: bool = False,
-        use_sigmoid_ce: bool = False,
-        get_fed_loss_cls_weights: Optional[Callable] = None,
-        fed_loss_num_classes: int = 50,
-    ):
-        """
-        NOTE: this interface is experimental.
-        Args:
-            input_shape (ShapeSpec): shape of the input feature to this module. A plain
-                int is also accepted and treated as the channel count.
-            box2box_transform (Box2BoxTransform or Box2BoxTransformRotated): transform
-                stored for decoding/encoding boxes; the length of its ``weights``
-                determines the number of regression outputs per box.
-            num_classes (int): number of foreground classes
-            test_score_thresh (float): threshold to filter predictions results.
-            test_nms_thresh (float): NMS threshold for prediction results.
-            test_topk_per_image (int): number of top predictions to produce per image.
-            cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression
-            smooth_l1_beta (float): transition point from L1 to L2 loss. Only used if
-                `box_reg_loss_type` is "smooth_l1"
-            box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou",
-                "diou", "ciou"
-            loss_weight (float|dict): weights to use for losses. Can be single number for
-                weighting all losses, or a dict of individual weightings. Valid dict keys are:
-                    * "loss_cls": applied to classification loss
-                    * "loss_box_reg": applied to box regression loss
-            use_fed_loss (bool): whether to use federated loss which samples additional negative
-                classes to calculate the loss
-            use_sigmoid_ce (bool): whether to calculate the loss using weighted average of binary
-                cross entropy with logits. This could be used together with federated loss
-            get_fed_loss_cls_weights (Callable): zero-argument callable, invoked only when
-                `use_fed_loss` is true, returning the probabilities used to sample negative
-                classes for federated loss (one entry per foreground class).
-            fed_loss_num_classes (int): number of federated classes to keep in total
-        """
-        super().__init__()
-        if isinstance(input_shape, int):  # some backward compatibility
-            input_shape = ShapeSpec(channels=input_shape)
-        self.num_classes = num_classes
-        # Missing width/height count as 1, so a channels-only shape works.
-        input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1)
-        # prediction layer for num_classes foreground classes and one background class (hence + 1)
-        self.cls_score = nn.Linear(input_size, num_classes + 1)
-        nn.init.normal_(self.cls_score.weight, std=0.01)
-        num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes
-        box_dim = len(box2box_transform.weights)
-        self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim)
-        nn.init.normal_(self.bbox_pred.weight, std=0.001)
-        for layer in [self.cls_score, self.bbox_pred]:
-            nn.init.constant_(layer.bias, 0)
-        self.box2box_transform = box2box_transform
-        self.smooth_l1_beta = smooth_l1_beta
-        self.test_score_thresh = test_score_thresh
-        self.test_nms_thresh = test_nms_thresh
-        self.test_topk_per_image = test_topk_per_image
-        self.box_reg_loss_type = box_reg_loss_type
-        # Expand a scalar weight to both losses. Ints are accepted as well, because
-        # `losses()` calls `self.loss_weight.get(...)` and therefore needs a dict.
-        if isinstance(loss_weight, (int, float)):
-            loss_weight = {"loss_cls": loss_weight, "loss_box_reg": loss_weight}
-        self.loss_weight = loss_weight
-        self.use_fed_loss = use_fed_loss
-        self.use_sigmoid_ce = use_sigmoid_ce
-        self.fed_loss_num_classes = fed_loss_num_classes
-        if self.use_fed_loss:
-            assert self.use_sigmoid_ce, "Please use sigmoid cross entropy loss with federated loss"
-            fed_loss_cls_weights = get_fed_loss_cls_weights()
-            assert (
-                len(fed_loss_cls_weights) == self.num_classes
-            ), "Please check the provided fed_loss_cls_weights. Their size should match num_classes"
-            self.register_buffer("fed_loss_cls_weights", fed_loss_cls_weights)
-    @classmethod
-    def from_config(cls, cfg, input_shape):
-        """Translate `cfg` into the keyword arguments of ``__init__``."""
-        def _fed_loss_cls_weights():
-            # Evaluated lazily: ``__init__`` only calls this when federated loss is on.
-            return get_fed_loss_cls_weights(dataset_names=cfg.DATASETS.TRAIN, freq_weight_power=cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT_POWER)  # noqa
-        kwargs = {"input_shape": input_shape}
-        kwargs["box2box_transform"] = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
-        kwargs["num_classes"] = cfg.MODEL.ROI_HEADS.NUM_CLASSES
-        kwargs["cls_agnostic_bbox_reg"] = cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG
-        kwargs["smooth_l1_beta"] = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA
-        kwargs["test_score_thresh"] = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
-        kwargs["test_nms_thresh"] = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
-        kwargs["test_topk_per_image"] = cfg.TEST.DETECTIONS_PER_IMAGE
-        kwargs["box_reg_loss_type"] = cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE
-        # Only the box-regression weight comes from the config; `losses()` falls back
-        # to 1.0 for keys missing from this dict.
-        kwargs["loss_weight"] = {"loss_box_reg": cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT}
-        kwargs["use_fed_loss"] = cfg.MODEL.ROI_BOX_HEAD.USE_FED_LOSS
-        kwargs["use_sigmoid_ce"] = cfg.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE
-        kwargs["get_fed_loss_cls_weights"] = _fed_loss_cls_weights
-        kwargs["fed_loss_num_classes"] = cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CLASSES
-        return kwargs
-    def forward(self, x):
-        """
-        Predict class scores and box regression deltas for every region.
-        Args:
-            x: per-region features of shape (N, ...) for N bounding boxes to predict.
-                Inputs with more than two dimensions are flattened to (N, -1) first.
-        Returns:
-            (Tensor, Tensor):
-            First tensor: shape (N,K+1), scores for each of the N box. Each row contains the
-            scores for K object categories and 1 background class.
-            Second tensor: bounding box regression deltas for each box. Shape is (N,Kx4),
-            or (N,4) for class-agnostic regression.
-        """
-        flat = x if x.dim() <= 2 else torch.flatten(x, start_dim=1)
-        return self.cls_score(flat), self.bbox_pred(flat)
-    def losses(self, predictions, proposals, weights=None):
-        """
-        Compute the classification and box-regression losses for one set of predictions.
-        Args:
-            predictions: return values of :meth:`forward()`.
-            proposals (list[Instances]): proposals that match the features that were used
-                to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``,
-                ``gt_classes`` are expected.
-            weights: optional per-proposal weights for reweighting the classification
-                loss of each instance based on IoU. Only applied on the softmax
-                cross-entropy path; ignored when ``self.use_sigmoid_ce`` is true.
-        Returns:
-            Dict[str, Tensor]: "loss_cls" and "loss_box_reg", each multiplied by its
-            entry in ``self.loss_weight`` (1.0 when the key is absent).
-        """
-        scores, proposal_deltas = predictions
-        # parse classification outputs
-        gt_classes = (
-            cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)
-        )
-        _log_classification_stats(scores, gt_classes)
-        # parse box regression outputs
-        if len(proposals):
-            proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)  # Nx4
-            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
-            # If "gt_boxes" does not exist, the proposals must be all negative and
-            # should not be included in regression loss computation.
-            # Here we just use proposal_boxes as an arbitrary placeholder because its
-            # value won't be used in self.box_reg_loss().
-            gt_boxes = cat(
-                [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals],
-                dim=0,
-            )
-        else:
-            proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device)
-        if self.use_sigmoid_ce:
-            loss_cls = self.sigmoid_cross_entropy_loss(scores, gt_classes)
-        elif weights is not None:
-            # Identity check: `!= None` on a tensor is a comparison, not a presence test.
-            # The weighted per-proposal losses are averaged over all proposals.
-            loss_cls = (weights * cross_entropy(scores, gt_classes, reduction='none')).mean()
-        else:
-            loss_cls = cross_entropy(scores, gt_classes, reduction="mean")
-        losses = {
-            "loss_cls": loss_cls,
-            "loss_box_reg": self.box_reg_loss(
-                proposal_boxes, gt_boxes, proposal_deltas, gt_classes
-            ),
-        }
-        return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
-    # Implementation from https://github.com/xingyizhou/CenterNet2/blob/master/projects/CenterNet2/centernet/modeling/roi_heads/fed_loss.py  # noqa
-    # with slight modifications
-    def get_fed_loss_classes(self, gt_classes, num_fed_loss_classes, num_classes, weight):
-        """
-        Args:
-            gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
-            num_fed_loss_classes: minimum number of classes to keep when calculating federated loss.
-            Will sample negative classes if number of unique gt_classes is smaller than this value.
-            num_classes: number of foreground classes
-            weight: probabilities used to sample negative classes
-        Returns:
-            Tensor:
-                classes to keep when calculating the federated loss, including both unique gt
-                classes and sampled negative classes.
-        """
-        unique_gt_classes = torch.unique(gt_classes)
-        prob = unique_gt_classes.new_ones(num_classes + 1).float()
-        prob[-1] = 0
-        if len(unique_gt_classes) < num_fed_loss_classes:
-            prob[:num_classes] = weight.float().clone()
-            prob[unique_gt_classes] = 0
-            sampled_negative_classes = torch.multinomial(
-                prob, num_fed_loss_classes - len(unique_gt_classes), replacement=False
-            )
-            fed_loss_classes = torch.cat([unique_gt_classes, sampled_negative_classes])
-        else:
-            fed_loss_classes = unique_gt_classes
-        return fed_loss_classes
-    # Implementation from https://github.com/xingyizhou/CenterNet2/blob/master/projects/CenterNet2/centernet/modeling/roi_heads/custom_fast_rcnn.py#L113  # noqa
-    # with slight modifications
-    def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes):
-        """
-        Args:
-            pred_class_logits: shape (N, K+1), scores for each of the N box. Each row contains the
-            scores for K object categories and 1 background class
-            gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
-        """
-        if pred_class_logits.numel() == 0:
-            return pred_class_logits.new_zeros([1])[0]
-        N = pred_class_logits.shape[0]
-        K = pred_class_logits.shape[1] - 1
-        target = pred_class_logits.new_zeros(N, K + 1)
-        target[range(len(gt_classes)), gt_classes] = 1
-        target = target[:, :K]
-        cls_loss = F.binary_cross_entropy_with_logits(
-            pred_class_logits[:, :-1], target, reduction="none"
-        )
-        if self.use_fed_loss:
-            fed_loss_classes = self.get_fed_loss_classes(
-                gt_classes,
-                num_fed_loss_classes=self.fed_loss_num_classes,
-                num_classes=K,
-                weight=self.fed_loss_cls_weights,
-            )
-            fed_loss_classes_mask = fed_loss_classes.new_zeros(K + 1)
-            fed_loss_classes_mask[fed_loss_classes] = 1
-            fed_loss_classes_mask = fed_loss_classes_mask[:K]
-            weight = fed_loss_classes_mask.view(1, K).expand(N, K).float()
-        else:
-            weight = 1
-        loss = torch.sum(cls_loss * weight) / N
-        return loss
-    def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes):
-        """
-        Args:
-            proposal_boxes/gt_boxes are tensors with the same shape (R, 4 or 5).
-            pred_deltas has shape (R, 4 or 5), or (R, num_classes * (4 or 5)).
-            gt_classes is a long tensor of shape R, the gt class label of each proposal.
-            R shall be the number of proposals.
-        """
-        box_dim = proposal_boxes.shape[1]  # 4 or 5
-        # Regression loss is only computed for foreground proposals (those matched to a GT)
-        fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0]
-        if pred_deltas.shape[1] == box_dim:  # cls-agnostic regression
-            fg_pred_deltas = pred_deltas[fg_inds]
-        else:
-            fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[
-                fg_inds, gt_classes[fg_inds]
-            ]
-        loss_box_reg = _dense_box_regression_loss(
-            [proposal_boxes[fg_inds]],
-            self.box2box_transform,
-            [fg_pred_deltas.unsqueeze(0)],
-            [gt_boxes[fg_inds]],
-            ...,
-            self.box_reg_loss_type,
-            self.smooth_l1_beta,
-        )
-        # The reg loss is normalized using the total number of regions (R), not the number
-        # of foreground regions even though the box regression loss is only defined on
-        # foreground regions. Why? Because doing so gives equal training influence to
-        # each foreground example. To see how, consider two different minibatches:
-        #  (1) Contains a single foreground region
-        #  (2) Contains 100 foreground regions
-        # If we normalize by the number of foreground regions, the single example in
-        # minibatch (1) will be given 100 times as much influence as each foreground
-        # example in minibatch (2). Normalizing by the total number of regions, R,
-        # means that the single example in minibatch (1) and each of the 100 examples
-        # in minibatch (2) are given equal influence.
-        return loss_box_reg / max(gt_classes.numel(), 1.0)  # return 0 if empty
-    def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]):
-        """
-        Args:
-            predictions: return values of :meth:`forward()`.
-            proposals (list[Instances]): proposals that match the features that were
-                used to compute predictions. The ``proposal_boxes`` field is expected.
-        Returns:
-            list[Instances]: same as `fast_rcnn_inference`.
-            list[Tensor]: same as `fast_rcnn_inference`.
-        """
-        boxes = self.predict_boxes(predictions, proposals)
-        scores = self.predict_probs(predictions, proposals)
-        image_shapes = [x.image_size for x in proposals]
-        return fast_rcnn_inference(
-            boxes,
-            scores,
-            image_shapes,
-            self.test_score_thresh,
-            self.test_nms_thresh,
-            self.test_topk_per_image,
-        )
-    def predict_boxes_for_gt_classes(self, predictions, proposals):
-        """
-        Args:
-            predictions: return values of :meth:`forward()`.
-            proposals (list[Instances]): proposals that match the features that were used
-                to compute predictions. The fields ``proposal_boxes``, ``gt_classes`` are expected.
-        Returns:
-            list[Tensor]:
-                A list of Tensors of predicted boxes for GT classes in case of
-                class-specific box head. Element i of the list has shape (Ri, B), where Ri is
-                the number of proposals for image i and B is the box dimension (4 or 5)
-        """
-        if not len(proposals):
-            return []
-        scores, proposal_deltas = predictions
-        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
-        N, B = proposal_boxes.shape
-        predict_boxes = self.box2box_transform.apply_deltas(
-            proposal_deltas, proposal_boxes
-        )  # Nx(KxB)
-        K = predict_boxes.shape[1] // B
-        if K > 1:
-            gt_classes = torch.cat([p.gt_classes for p in proposals], dim=0)
-            # Some proposals are ignored or have a background class. Their gt_classes
-            # cannot be used as index.
-            gt_classes = gt_classes.clamp_(0, K - 1)
-            predict_boxes = predict_boxes.view(N, K, B)[
-                torch.arange(N, dtype=torch.long, device=predict_boxes.device), gt_classes
-            ]
-        num_prop_per_image = [len(p) for p in proposals]
-        return predict_boxes.split(num_prop_per_image)
-    def predict_boxes(
-        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
-    ):
-        """
-        Args:
-            predictions: return values of :meth:`forward()`.
-            proposals (list[Instances]): proposals that match the features that were
-                used to compute predictions. The ``proposal_boxes`` field is expected.
-        Returns:
-            list[Tensor]:
-                A list of Tensors of predicted class-specific or class-agnostic boxes
-                for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
-                the number of proposals for image i and B is the box dimension (4 or 5)
-        """
-        if not len(proposals):
-            return []
-        _, proposal_deltas = predictions
-        num_prop_per_image = [len(p) for p in proposals]
-        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
-        predict_boxes = self.box2box_transform.apply_deltas(
-            proposal_deltas,
-            proposal_boxes,
-        )  # Nx(KxB)
-        return predict_boxes.split(num_prop_per_image)
-    def predict_probs(
-        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
-    ):
-        """
-        Args:
-            predictions: return values of :meth:`forward()`.
-            proposals (list[Instances]): proposals that match the features that were
-                used to compute predictions.
-        Returns:
-            list[Tensor]:
-                A list of Tensors of predicted class probabilities for each image.
-                Element i has shape (Ri, K + 1), where Ri is the number of proposals for image i.
-        """
-        scores, _ = predictions
-        num_inst_per_image = [len(p) for p in proposals]
-        if self.use_sigmoid_ce:
-            probs = scores.sigmoid()
-        else:
-            probs = F.softmax(scores, dim=-1)
-        return probs.split(num_inst_per_image, dim=0)

cutler/modeling/roi_heads/roi_heads.py DELETED Viewed

@@ -1,926 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# Modified by XuDong Wang from https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/roi_heads/roi_heads.py
-import inspect
-import logging
-import numpy as np
-from typing import Dict, List, Optional, Tuple
-import torch
-from torch import nn
-from detectron2.config import configurable
-from detectron2.layers import ShapeSpec, nonzero_tuple
-from detectron2.structures import Boxes, pairwise_iou
-from structures import pairwise_iou_max_scores
-from detectron2.structures import ImageList, Instances
-from detectron2.utils.events import get_event_storage
-from detectron2.utils.registry import Registry
-from detectron2.modeling.backbone.resnet import BottleneckBlock, ResNet
-from detectron2.modeling.matcher import Matcher
-from detectron2.modeling.poolers import ROIPooler
-from detectron2.modeling.proposal_generator.proposal_utils import add_ground_truth_to_proposals
-from detectron2.modeling.sampling import subsample_labels
-from detectron2.modeling.roi_heads.box_head import build_box_head
-from .fast_rcnn import FastRCNNOutputLayers
-from detectron2.modeling.roi_heads.keypoint_head import build_keypoint_head
-from detectron2.modeling.roi_heads.mask_head import build_mask_head
-from detectron2.modeling.box_regression import Box2BoxTransform
-import torch.nn.functional as F
-from colored import fg
-blue, red = fg('blue'), fg('red')
-ROI_HEADS_REGISTRY = Registry("ROI_HEADS")
-ROI_HEADS_REGISTRY.__doc__ = """
-Registry for ROI heads in a generalized R-CNN model.
-ROIHeads take feature maps and region proposals, and
-perform per-region computation.
-The registered object will be called with `obj(cfg, input_shape)`.
-The call is expected to return an :class:`ROIHeads`.
-"""
-logger = logging.getLogger(__name__)
-def build_roi_heads(cfg, input_shape):
-    """
-    Build ROIHeads defined by `cfg.MODEL.ROI_HEADS.NAME`.
-    """
-    name = cfg.MODEL.ROI_HEADS.NAME
-    return ROI_HEADS_REGISTRY.get(name)(cfg, input_shape)
-def select_foreground_proposals(
-    proposals: List[Instances], bg_label: int
-) -> Tuple[List[Instances], List[torch.Tensor]]:
-    """
-    Given a list of N Instances (for N images), each containing a `gt_classes` field,
-    return a list of Instances that contain only instances with `gt_classes != -1 &&
-    gt_classes != bg_label`.
-    Args:
-        proposals (list[Instances]): A list of N Instances, where N is the number of
-            images in the batch.
-        bg_label: label index of background class.
-    Returns:
-        list[Instances]: N Instances, each contains only the selected foreground instances.
-        list[Tensor]: N boolean vector, correspond to the selection mask of
-            each Instances object. True for selected instances.
-    """
-    assert isinstance(proposals, (list, tuple))
-    assert isinstance(proposals[0], Instances)
-    assert proposals[0].has("gt_classes")
-    fg_proposals = []
-    fg_selection_masks = []
-    for proposals_per_image in proposals:
-        gt_classes = proposals_per_image.gt_classes
-        fg_selection_mask = (gt_classes != -1) & (gt_classes != bg_label)
-        fg_idxs = fg_selection_mask.nonzero().squeeze(1)
-        fg_proposals.append(proposals_per_image[fg_idxs])
-        fg_selection_masks.append(fg_selection_mask)
-    return fg_proposals, fg_selection_masks
-def select_proposals_with_visible_keypoints(proposals: List[Instances]) -> List[Instances]:
-    """
-    Args:
-        proposals (list[Instances]): a list of N Instances, where N is the
-            number of images.
-    Returns:
-        proposals: only contains proposals with at least one visible keypoint.
-    Note that this is still slightly different from Detectron.
-    In Detectron, proposals for training keypoint head are re-sampled from
-    all the proposals with IOU>threshold & >=1 visible keypoint.
-    Here, the proposals are first sampled from all proposals with
-    IOU>threshold, then proposals with no visible keypoint are filtered out.
-    This strategy seems to make no difference on Detectron and is easier to implement.
-    """
-    ret = []
-    all_num_fg = []
-    for proposals_per_image in proposals:
-        # If empty/unannotated image (hard negatives), skip filtering for train
-        if len(proposals_per_image) == 0:
-            ret.append(proposals_per_image)
-            continue
-        gt_keypoints = proposals_per_image.gt_keypoints.tensor
-        # #fg x K x 3
-        vis_mask = gt_keypoints[:, :, 2] >= 1
-        xs, ys = gt_keypoints[:, :, 0], gt_keypoints[:, :, 1]
-        proposal_boxes = proposals_per_image.proposal_boxes.tensor.unsqueeze(dim=1)  # #fg x 1 x 4
-        kp_in_box = (
-            (xs >= proposal_boxes[:, :, 0])
-            & (xs <= proposal_boxes[:, :, 2])
-            & (ys >= proposal_boxes[:, :, 1])
-            & (ys <= proposal_boxes[:, :, 3])
-        )
-        selection = (kp_in_box & vis_mask).any(dim=1)
-        selection_idxs = nonzero_tuple(selection)[0]
-        all_num_fg.append(selection_idxs.numel())
-        ret.append(proposals_per_image[selection_idxs])
-    storage = get_event_storage()
-    storage.put_scalar("keypoint_head/num_fg_samples", np.mean(all_num_fg))
-    return ret
-class ROIHeads(torch.nn.Module):
-    """
-    ROIHeads perform all per-region computation in an R-CNN.
-    It typically contains logic to
-    1. (in training only) match proposals with ground truth and sample them
-    2. crop the regions and extract per-region features using proposals
-    3. make per-region predictions with different heads
-    It can have many variants, implemented as subclasses of this class.
-    This base class contains the logic to match/sample proposals.
-    But it is not necessary to inherit this class if the sampling logic is not needed.
-    """
-    @configurable
-    def __init__(
-        self,
-        *,
-        num_classes,
-        batch_size_per_image,
-        positive_fraction,
-        proposal_matcher,
-        proposal_append_gt=True,
-    ):
-        """
-        NOTE: this interface is experimental.
-        Args:
-            num_classes (int): number of foreground classes (i.e. background is not included)
-            batch_size_per_image (int): number of proposals to sample for training
-            positive_fraction (float): fraction of positive (foreground) proposals
-                to sample for training.
-            proposal_matcher (Matcher): matcher that matches proposals and ground truth
-            proposal_append_gt (bool): whether to include ground truth as proposals as well
-        """
-        super().__init__()
-        self.batch_size_per_image = batch_size_per_image
-        self.positive_fraction = positive_fraction
-        self.num_classes = num_classes
-        self.proposal_matcher = proposal_matcher
-        self.proposal_append_gt = proposal_append_gt
-    @classmethod
-    def from_config(cls, cfg):
-        return {
-            "batch_size_per_image": cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE,
-            "positive_fraction": cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION,
-            "num_classes": cfg.MODEL.ROI_HEADS.NUM_CLASSES,
-            "proposal_append_gt": cfg.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT,
-            # Matcher to assign box proposals to gt boxes
-            "proposal_matcher": Matcher(
-                cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS,
-                cfg.MODEL.ROI_HEADS.IOU_LABELS,
-                allow_low_quality_matches=False,
-            ),
-        }
-    def _sample_proposals(
-        self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Based on the matching between N proposals and M groundtruth,
-        sample the proposals and set their classification labels.
-        Args:
-            matched_idxs (Tensor): a vector of length N, each is the best-matched
-                gt index in [0, M) for each proposal.
-            matched_labels (Tensor): a vector of length N, the matcher's label
-                (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
-            gt_classes (Tensor): a vector of length M.
-        Returns:
-            Tensor: a vector of indices of sampled proposals. Each is in [0, N).
-            Tensor: a vector of the same length, the classification label for
-                each sampled proposal. Each sample is labeled as either a category in
-                [0, num_classes) or the background (num_classes).
-        """
-        has_gt = gt_classes.numel() > 0
-        # Get the corresponding GT for each proposal
-        if has_gt:
-            gt_classes = gt_classes[matched_idxs]
-            # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
-            gt_classes[matched_labels == 0] = self.num_classes
-            # Label ignore proposals (-1 label)
-            gt_classes[matched_labels == -1] = -1
-        else:
-            gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
-        sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
-            gt_classes, self.batch_size_per_image, self.positive_fraction, self.num_classes
-        )
-        sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
-        return sampled_idxs, gt_classes[sampled_idxs]
-    @torch.no_grad()
-    def label_and_sample_proposals(
-        self, proposals: List[Instances], targets: List[Instances]
-    ) -> List[Instances]:
-        """
-        Prepare some proposals to be used to train the ROI heads.
-        It performs box matching between `proposals` and `targets`, and assigns
-        training labels to the proposals.
-        It returns ``self.batch_size_per_image`` random samples from proposals and groundtruth
-        boxes, with a fraction of positives that is no larger than
-        ``self.positive_fraction``.
-        Args:
-            See :meth:`ROIHeads.forward`
-        Returns:
-            list[Instances]:
-                length `N` list of `Instances`s containing the proposals
-                sampled for training. Each `Instances` has the following fields:
-                - proposal_boxes: the proposal boxes
-                - gt_boxes: the ground-truth box that the proposal is assigned to
-                  (this is only meaningful if the proposal has a label > 0; if label = 0
-                  then the ground-truth box is random)
-                Other fields such as "gt_classes", "gt_masks", that's included in `targets`.
-        """
-        # Augment proposals with ground-truth boxes.
-        # In the case of learned proposals (e.g., RPN), when training starts
-        # the proposals will be low quality due to random initialization.
-        # It's possible that none of these initial
-        # proposals have high enough overlap with the gt objects to be used
-        # as positive examples for the second stage components (box head,
-        # cls head, mask head). Adding the gt boxes to the set of proposals
-        # ensures that the second stage components will have some positive
-        # examples from the start of training. For RPN, this augmentation improves
-        # convergence and empirically improves box AP on COCO by about 0.5
-        # points (under one tested configuration).
-        if self.proposal_append_gt:
-            proposals = add_ground_truth_to_proposals(targets, proposals)
-        proposals_with_gt = []
-        num_fg_samples = []
-        num_bg_samples = []
-        for proposals_per_image, targets_per_image in zip(proposals, targets):
-            has_gt = len(targets_per_image) > 0
-            match_quality_matrix = pairwise_iou(
-                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
-            )
-            matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
-            sampled_idxs, gt_classes = self._sample_proposals(
-                matched_idxs, matched_labels, targets_per_image.gt_classes
-            )
-            # Set target attributes of the sampled proposals:
-            proposals_per_image = proposals_per_image[sampled_idxs]
-            proposals_per_image.gt_classes = gt_classes
-            if has_gt:
-                sampled_targets = matched_idxs[sampled_idxs]
-                # We index all the attributes of targets that start with "gt_"
-                # and have not been added to proposals yet (="gt_classes").
-                # NOTE: here the indexing waste some compute, because heads
-                # like masks, keypoints, etc, will filter the proposals again,
-                # (by foreground/background, or number of keypoints in the image, etc)
-                # so we essentially index the data twice.
-                for (trg_name, trg_value) in targets_per_image.get_fields().items():
-                    if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
-                        proposals_per_image.set(trg_name, trg_value[sampled_targets])
-            # If no GT is given in the image, we don't know what a dummy gt value can be.
-            # Therefore the returned proposals won't have any gt_* fields, except for a
-            # gt_classes full of background label.
-            num_bg_samples.append((gt_classes == self.num_classes).sum().item())
-            num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
-            proposals_with_gt.append(proposals_per_image)
-        # Log the number of fg/bg samples that are selected for training ROI heads
-        storage = get_event_storage()
-        storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
-        storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))
-        return proposals_with_gt
-    def forward(
-        self,
-        images: ImageList,
-        features: Dict[str, torch.Tensor],
-        proposals: List[Instances],
-        targets: Optional[List[Instances]] = None,
-    ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]:
-        """
-        Args:
-            images (ImageList):
-            features (dict[str,Tensor]): input data as a mapping from feature
-                map name to tensor. Axis 0 represents the number of images `N` in
-                the input data; axes 1-3 are channels, height, and width, which may
-                vary between feature maps (e.g., if a feature pyramid is used).
-            proposals (list[Instances]): length `N` list of `Instances`. The i-th
-                `Instances` contains object proposals for the i-th input image,
-                with fields "proposal_boxes" and "objectness_logits".
-            targets (list[Instances], optional): length `N` list of `Instances`. The i-th
-                `Instances` contains the ground-truth per-instance annotations
-                for the i-th input image.  Specify `targets` during training only.
-                It may have the following fields:
-                - gt_boxes: the bounding box of each instance.
-                - gt_classes: the label for each instance with a category ranging in [0, #class].
-                - gt_masks: PolygonMasks or BitMasks, the ground-truth masks of each instance.
-                - gt_keypoints: NxKx3, the groud-truth keypoints for each instance.
-        Returns:
-            list[Instances]: length `N` list of `Instances` containing the
-            detected instances. Returned during inference only; may be [] during training.
-            dict[str->Tensor]:
-            mapping from a named loss to a tensor storing the loss. Used during training only.
-        """
-        raise NotImplementedError()
-@ROI_HEADS_REGISTRY.register()
-class Res5ROIHeads(ROIHeads):
-    """
-    The ROIHeads in a typical "C4" R-CNN model, where
-    the box and mask head share the cropping and
-    the per-region feature computation by a Res5 block.
-    See :paper:`ResNet` Appendix A.
-    """
-    @configurable
-    def __init__(
-        self,
-        *,
-        in_features: List[str],
-        pooler: ROIPooler,
-        res5: nn.Module,
-        box_predictor: nn.Module,
-        mask_head: Optional[nn.Module] = None,
-        **kwargs,
-    ):
-        """
-        NOTE: this interface is experimental.
-        Args:
-            in_features (list[str]): list of backbone feature map names to use for
-                feature extraction
-            pooler (ROIPooler): pooler to extra region features from backbone
-            res5 (nn.Sequential): a CNN to compute per-region features, to be used by
-                ``box_predictor`` and ``mask_head``. Typically this is a "res5"
-                block from a ResNet.
-            box_predictor (nn.Module): make box predictions from the feature.
-                Should have the same interface as :class:`FastRCNNOutputLayers`.
-            mask_head (nn.Module): transform features to make mask predictions
-        """
-        super().__init__(**kwargs)
-        self.in_features = in_features
-        self.pooler = pooler
-        if isinstance(res5, (list, tuple)):
-            res5 = nn.Sequential(*res5)
-        self.res5 = res5
-        self.box_predictor = box_predictor
-        self.mask_on = mask_head is not None
-        if self.mask_on:
-            self.mask_head = mask_head
-    @classmethod
-    def from_config(cls, cfg, input_shape):
-        # fmt: off
-        ret = super().from_config(cfg)
-        in_features = ret["in_features"] = cfg.MODEL.ROI_HEADS.IN_FEATURES
-        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
-        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
-        pooler_scales     = (1.0 / input_shape[in_features[0]].stride, )
-        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
-        mask_on           = cfg.MODEL.MASK_ON
-        # fmt: on
-        assert not cfg.MODEL.KEYPOINT_ON
-        assert len(in_features) == 1
-        ret["pooler"] = ROIPooler(
-            output_size=pooler_resolution,
-            scales=pooler_scales,
-            sampling_ratio=sampling_ratio,
-            pooler_type=pooler_type,
-        )
-        # Compatbility with old moco code. Might be useful.
-        # See notes in StandardROIHeads.from_config
-        if not inspect.ismethod(cls._build_res5_block):
-            logger.warning(
-                "The behavior of _build_res5_block may change. "
-                "Please do not depend on private methods."
-            )
-            cls._build_res5_block = classmethod(cls._build_res5_block)
-        ret["res5"], out_channels = cls._build_res5_block(cfg)
-        ret["box_predictor"] = FastRCNNOutputLayers(
-            cfg, ShapeSpec(channels=out_channels, height=1, width=1)
-        )
-        if mask_on:
-            ret["mask_head"] = build_mask_head(
-                cfg,
-                ShapeSpec(channels=out_channels, width=pooler_resolution, height=pooler_resolution),
-            )
-        return ret
-    @classmethod
-    def _build_res5_block(cls, cfg):
-        # fmt: off
-        stage_channel_factor = 2 ** 3  # res5 is 8x res2
-        num_groups           = cfg.MODEL.RESNETS.NUM_GROUPS
-        width_per_group      = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
-        bottleneck_channels  = num_groups * width_per_group * stage_channel_factor
-        out_channels         = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * stage_channel_factor
-        stride_in_1x1        = cfg.MODEL.RESNETS.STRIDE_IN_1X1
-        norm                 = cfg.MODEL.RESNETS.NORM
-        assert not cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE[-1], \
-            "Deformable conv is not yet supported in res5 head."
-        # fmt: on
-        blocks = ResNet.make_stage(
-            BottleneckBlock,
-            3,
-            stride_per_block=[2, 1, 1],
-            in_channels=out_channels // 2,
-            bottleneck_channels=bottleneck_channels,
-            out_channels=out_channels,
-            num_groups=num_groups,
-            norm=norm,
-            stride_in_1x1=stride_in_1x1,
-        )
-        return nn.Sequential(*blocks), out_channels
-    def _shared_roi_transform(self, features: List[torch.Tensor], boxes: List[Boxes]):
-        x = self.pooler(features, boxes)
-        return self.res5(x)
-    def forward(
-        self,
-        images: ImageList,
-        features: Dict[str, torch.Tensor],
-        proposals: List[Instances],
-        targets: Optional[List[Instances]] = None,
-    ):
-        """
-        See :meth:`ROIHeads.forward`.
-        Returns:
-            In training, ``([], losses)``; in inference, ``(pred_instances, {})``.
-        """
-        # `images` is not used by this head.
-        del images
-        if self.training:
-            assert targets
-            proposals = self.label_and_sample_proposals(proposals, targets)
-        del targets
-        roi_boxes = [per_image.proposal_boxes for per_image in proposals]
-        selected_maps = [features[name] for name in self.in_features]
-        box_features = self._shared_roi_transform(selected_maps, roi_boxes)
-        # The predictor consumes the ROI features averaged over dims 2 and 3.
-        predictions = self.box_predictor(box_features.mean(dim=[2, 3]))
-        if not self.training:
-            pred_instances, _ = self.box_predictor.inference(predictions, proposals)
-            pred_instances = self.forward_with_given_boxes(features, pred_instances)
-            return pred_instances, {}
-        # Training path: the raw feature maps are no longer needed from here on.
-        del features
-        losses = self.box_predictor.losses(predictions, proposals)
-        if self.mask_on:
-            proposals, fg_selection_masks = select_foreground_proposals(
-                proposals, self.num_classes
-            )
-            # The ROI transform is shared between the box and mask branches, so the
-            # pooled features are reused instead of recomputed. The mask loss is only
-            # defined on foreground proposals, hence the foreground selection below.
-            fg_mask = torch.cat(fg_selection_masks, dim=0)
-            mask_features = box_features[fg_mask]
-            del box_features
-            losses.update(self.mask_head(mask_features, proposals))
-        return [], losses
-    def forward_with_given_boxes(
-        self, features: Dict[str, torch.Tensor], instances: List[Instances]
-    ) -> List[Instances]:
-        """
-        Produce the other (non-box) per-ROI outputs for boxes that are already known.
-        Args:
-            features: same as in `forward()`
-            instances (list[Instances]): instances to predict other outputs. Expect the keys
-                "pred_boxes" and "pred_classes" to exist.
-        Returns:
-            instances (Instances):
-                the same `Instances` object, with extra
-                fields such as `pred_masks` or `pred_keypoints`.
-        """
-        assert not self.training
-        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")
-        if not self.mask_on:
-            # No mask branch configured: nothing to add.
-            return instances
-        feature_list = [features[name] for name in self.in_features]
-        pred_boxes = [per_image.pred_boxes for per_image in instances]
-        roi_features = self._shared_roi_transform(feature_list, pred_boxes)
-        return self.mask_head(roi_features, instances)
-@ROI_HEADS_REGISTRY.register()
-class CustomStandardROIHeads(ROIHeads):
-    """
-    It's "standard" in a sense that there is no ROI transform sharing
-    or feature sharing between tasks.
-    Each head independently processes the input features by each head's
-    own pooler and head.
-    This class is used by most models, such as FPN and C5.
-    To implement more models, you can subclass it and implement a different
-    :meth:`forward()` or a head.
-    In addition, when ``use_droploss`` is set, the box branch passes per-proposal
-    weights to ``box_predictor.losses`` during training; see :meth:`_droploss_weights`.
-    """
-    @configurable
-    def __init__(
-        self,
-        *,
-        box_in_features: List[str],
-        box_pooler: ROIPooler,
-        box_head: nn.Module,
-        box_predictor: nn.Module,
-        mask_in_features: Optional[List[str]] = None,
-        mask_pooler: Optional[ROIPooler] = None,
-        mask_head: Optional[nn.Module] = None,
-        keypoint_in_features: Optional[List[str]] = None,
-        keypoint_pooler: Optional[ROIPooler] = None,
-        keypoint_head: Optional[nn.Module] = None,
-        train_on_pred_boxes: bool = False,
-        box2box_transform = Box2BoxTransform,
-        use_droploss: bool = False,
-        droploss_iou_thresh: float = 1.0,
-        **kwargs,
-    ):
-        """
-        NOTE: this interface is experimental.
-        Args:
-            box_in_features (list[str]): list of feature names to use for the box head.
-            box_pooler (ROIPooler): pooler to extract region features for box head
-            box_head (nn.Module): transform features to make box predictions
-            box_predictor (nn.Module): make box predictions from the feature.
-                Should have the same interface as :class:`FastRCNNOutputLayers`.
-            mask_in_features (list[str]): list of feature names to use for the mask
-                pooler or mask head. None if not using mask head.
-            mask_pooler (ROIPooler): pooler to extract region features from image features.
-                The mask head will then take region features to make predictions.
-                If None, the mask head will directly take the dict of image features
-                defined by `mask_in_features`
-            mask_head (nn.Module): transform features to make mask predictions
-            keypoint_in_features, keypoint_pooler, keypoint_head: similar to ``mask_*``.
-            train_on_pred_boxes (bool): whether to use proposal boxes or
-                predicted boxes from the box head to train other heads.
-            box2box_transform: object whose ``apply_deltas(deltas, boxes)`` is used to
-                decode predicted boxes for DropLoss. NOTE: the default is the
-                ``Box2BoxTransform`` class itself, not an instance; `from_config`
-                replaces it with a configured instance whenever DropLoss is enabled.
-                Pass an instance when constructing directly with ``use_droploss=True``.
-            use_droploss (bool): whether to compute DropLoss weights in training.
-            droploss_iou_thresh (float): IoU threshold used by the DropLoss weights;
-                see :meth:`_droploss_weights`.
-            **kwargs: forwarded to the parent ``ROIHeads.__init__``.
-        """
-        super().__init__(**kwargs)
-        # keep self.in_features for backward compatibility
-        self.in_features = self.box_in_features = box_in_features
-        self.box_pooler = box_pooler
-        self.box_head = box_head
-        self.box_predictor = box_predictor
-        # The mask / keypoint attributes below only exist when the branch is enabled.
-        self.mask_on = mask_in_features is not None
-        if self.mask_on:
-            self.mask_in_features = mask_in_features
-            self.mask_pooler = mask_pooler
-            self.mask_head = mask_head
-        self.keypoint_on = keypoint_in_features is not None
-        if self.keypoint_on:
-            self.keypoint_in_features = keypoint_in_features
-            self.keypoint_pooler = keypoint_pooler
-            self.keypoint_head = keypoint_head
-        self.train_on_pred_boxes = train_on_pred_boxes
-        self.use_droploss = use_droploss
-        self.box2box_transform = box2box_transform
-        self.droploss_iou_thresh = droploss_iou_thresh
-    @classmethod
-    def from_config(cls, cfg, input_shape):
-        """
-        Build the keyword arguments for `__init__` from `cfg`.
-        Args:
-            cfg: the config node; the keys read here live under ``cfg.MODEL``.
-            input_shape: mapping from feature name to its shape spec, forwarded
-                to the ``_init_*_head`` builders.
-        Returns:
-            dict: keyword arguments for `__init__`.
-        """
-        ret = super().from_config(cfg)
-        ret["train_on_pred_boxes"] = cfg.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES
-        # The DropLoss arguments are only set when the feature is enabled; otherwise
-        # the `__init__` defaults apply.
-        if cfg.MODEL.ROI_HEADS.USE_DROPLOSS:
-            ret['use_droploss'] = True
-            ret['droploss_iou_thresh'] = cfg.MODEL.ROI_HEADS.DROPLOSS_IOU_THRESH
-            ret['box2box_transform'] = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
-        # Subclasses that have not been updated to use from_config style construction
-        # may have overridden _init_*_head methods. In this case, those overridden methods
-        # will not be classmethods and we need to avoid trying to call them here.
-        # We test for this with ismethod which only returns True for bound methods of cls.
-        # Such subclasses will need to handle calling their overridden _init_*_head methods.
-        if inspect.ismethod(cls._init_box_head):
-            ret.update(cls._init_box_head(cfg, input_shape))
-        if inspect.ismethod(cls._init_mask_head):
-            ret.update(cls._init_mask_head(cfg, input_shape))
-        if inspect.ismethod(cls._init_keypoint_head):
-            ret.update(cls._init_keypoint_head(cfg, input_shape))
-        return ret
-    @classmethod
-    def _init_box_head(cls, cfg, input_shape):
-        """
-        Build the box pooler, box head and box predictor from `cfg`.
-        Returns:
-            dict: the ``box_in_features``, ``box_pooler``, ``box_head`` and
-            ``box_predictor`` arguments of `__init__`.
-        """
-        # fmt: off
-        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
-        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
-        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
-        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
-        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
-        # fmt: on
-        # If CustomStandardROIHeads is applied on multiple feature maps (as in FPN),
-        # then we share the same predictors and therefore the channel counts must be the same
-        in_channels = [input_shape[f].channels for f in in_features]
-        # Check all channel counts are equal
-        assert len(set(in_channels)) == 1, in_channels
-        in_channels = in_channels[0]
-        box_pooler = ROIPooler(
-            output_size=pooler_resolution,
-            scales=pooler_scales,
-            sampling_ratio=sampling_ratio,
-            pooler_type=pooler_type,
-        )
-        # Here we split "box head" and "box predictor", which is mainly due to historical reasons.
-        # They are used together so the "box predictor" layers should be part of the "box head".
-        # New subclasses of ROIHeads do not need "box predictor"s.
-        box_head = build_box_head(
-            cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution)
-        )
-        box_predictor = FastRCNNOutputLayers(cfg, box_head.output_shape)
-        return {
-            "box_in_features": in_features,
-            "box_pooler": box_pooler,
-            "box_head": box_head,
-            "box_predictor": box_predictor,
-        }
-    @classmethod
-    def _init_mask_head(cls, cfg, input_shape):
-        """
-        Build the mask pooler and mask head from `cfg`.
-        Returns:
-            dict: empty when ``cfg.MODEL.MASK_ON`` is false; otherwise the
-            ``mask_in_features``, ``mask_pooler`` and ``mask_head`` arguments of
-            `__init__`. ``mask_pooler`` is None when the configured pooler type is
-            falsy, in which case the head is built on the per-feature input shapes.
-        """
-        if not cfg.MODEL.MASK_ON:
-            return {}
-        # fmt: off
-        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
-        pooler_resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
-        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)
-        sampling_ratio    = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO
-        pooler_type       = cfg.MODEL.ROI_MASK_HEAD.POOLER_TYPE
-        # fmt: on
-        # Unlike the box head, only the first feature's channel count is used here.
-        in_channels = [input_shape[f].channels for f in in_features][0]
-        ret = {"mask_in_features": in_features}
-        ret["mask_pooler"] = (
-            ROIPooler(
-                output_size=pooler_resolution,
-                scales=pooler_scales,
-                sampling_ratio=sampling_ratio,
-                pooler_type=pooler_type,
-            )
-            if pooler_type
-            else None
-        )
-        if pooler_type:
-            shape = ShapeSpec(
-                channels=in_channels, width=pooler_resolution, height=pooler_resolution
-            )
-        else:
-            shape = {f: input_shape[f] for f in in_features}
-        ret["mask_head"] = build_mask_head(cfg, shape)
-        return ret
-    @classmethod
-    def _init_keypoint_head(cls, cfg, input_shape):
-        """
-        Build the keypoint pooler and keypoint head from `cfg`.
-        Returns:
-            dict: empty when ``cfg.MODEL.KEYPOINT_ON`` is false; otherwise the
-            ``keypoint_in_features``, ``keypoint_pooler`` and ``keypoint_head``
-            arguments of `__init__`. ``keypoint_pooler`` is None when the configured
-            pooler type is falsy, in which case the head is built on the
-            per-feature input shapes.
-        """
-        if not cfg.MODEL.KEYPOINT_ON:
-            return {}
-        # fmt: off
-        in_features       = cfg.MODEL.ROI_HEADS.IN_FEATURES
-        pooler_resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION
-        pooler_scales     = tuple(1.0 / input_shape[k].stride for k in in_features)  # noqa
-        sampling_ratio    = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO
-        pooler_type       = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE
-        # fmt: on
-        # Only the first feature's channel count is used here.
-        in_channels = [input_shape[f].channels for f in in_features][0]
-        ret = {"keypoint_in_features": in_features}
-        ret["keypoint_pooler"] = (
-            ROIPooler(
-                output_size=pooler_resolution,
-                scales=pooler_scales,
-                sampling_ratio=sampling_ratio,
-                pooler_type=pooler_type,
-            )
-            if pooler_type
-            else None
-        )
-        if pooler_type:
-            shape = ShapeSpec(
-                channels=in_channels, width=pooler_resolution, height=pooler_resolution
-            )
-        else:
-            shape = {f: input_shape[f] for f in in_features}
-        ret["keypoint_head"] = build_keypoint_head(cfg, shape)
-        return ret
-    def forward(
-        self,
-        images: ImageList,
-        features: Dict[str, torch.Tensor],
-        proposals: List[Instances],
-        targets: Optional[List[Instances]] = None,
-    ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]:
-        """
-        See :class:`ROIHeads.forward`.
-        Returns:
-            In training, ``(proposals, losses)``; in inference, ``(pred_instances, {})``.
-        """
-        del images
-        if self.training:
-            assert targets, "'targets' argument is required during training"
-            proposals = self.label_and_sample_proposals(proposals, targets)
-        del targets
-        if self.training:
-            losses = self._forward_box(features, proposals)
-            # Usually the original proposals used by the box head are used by the mask, keypoint
-            # heads. But when `self.train_on_pred_boxes is True`, proposals will contain boxes
-            # predicted by the box head.
-            losses.update(self._forward_mask(features, proposals))
-            losses.update(self._forward_keypoint(features, proposals))
-            return proposals, losses
-        else:
-            pred_instances = self._forward_box(features, proposals)
-            # During inference cascaded prediction is used: the mask and keypoints heads are only
-            # applied to the top scoring box detections.
-            pred_instances = self.forward_with_given_boxes(features, pred_instances)
-            return pred_instances, {}
-    def forward_with_given_boxes(
-        self, features: Dict[str, torch.Tensor], instances: List[Instances]
-    ) -> List[Instances]:
-        """
-        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.
-        This is useful for downstream tasks where a box is known, but need to obtain
-        other attributes (outputs of other heads).
-        Test-time augmentation also uses this.
-        Args:
-            features: same as in `forward()`
-            instances (list[Instances]): instances to predict other outputs. Expect the keys
-                "pred_boxes" and "pred_classes" to exist.
-        Returns:
-            list[Instances]:
-                the same `Instances` objects, with extra
-                fields such as `pred_masks` or `pred_keypoints`.
-        """
-        assert not self.training
-        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")
-        instances = self._forward_mask(features, instances)
-        instances = self._forward_keypoint(features, instances)
-        return instances
-    def _droploss_weights(self, predictions, proposals: List[Instances]):
-        """
-        Compute the per-proposal weights passed to the box losses when DropLoss is on.
-        For every image the predicted boxes are decoded from ``predictions[1]`` and
-        compared against that image's ground-truth boxes; a proposal gets weight 1
-        when its maximum IoU is above ``self.droploss_iou_thresh`` and 0 otherwise.
-        Args:
-            predictions: output of ``self.box_predictor``; element 1 holds the box deltas.
-            proposals (list[Instances]): the sampled per-image proposals.
-        Returns:
-            A detached float tensor with one weight per proposal, or None when some
-            image's proposals carry no ``gt_boxes`` field, in which case the caller
-            falls back to the unweighted loss.
-        """
-        # the first K proposals are GT proposals
-        # NOTE(review): the ``[:100]`` window and the ``[:num_gt]`` slice below rely on
-        # that ordering, which is not established in this class -- confirm upstream.
-        try:
-            box_num_list = [len(x.gt_boxes) for x in proposals]
-            gt_num_list = [torch.unique(x.gt_boxes.tensor[:100], dim=0).size()[0] for x in proposals]
-        except AttributeError:
-            # A missing `gt_boxes` field surfaces as AttributeError. Only that case
-            # means "no GT available"; any other error is a real bug and propagates
-            # instead of being silently swallowed.
-            return None
-        # NOTE: maximum overlapping with GT (IoU)
-        predictions_delta = predictions[1]
-        proposal_boxes = Boxes.cat([x.proposal_boxes for x in proposals])
-        predictions_bbox = self.box2box_transform.apply_deltas(predictions_delta, proposal_boxes.tensor)
-        # `predictions_bbox` is the concatenation over images; walk it image by image.
-        idx_start = 0
-        iou_max_list = []
-        for x, num_boxes, num_gt in zip(proposals, box_num_list, gt_num_list):
-            idx_end = idx_start + num_boxes
-            iou_max_list.append(pairwise_iou_max_scores(predictions_bbox[idx_start:idx_end], x.gt_boxes[:num_gt].tensor))
-            idx_start = idx_end
-        iou_max = torch.cat(iou_max_list, dim=0)
-        # 0 where max IoU <= threshold, 1 elsewhere.
-        weights = 1 - iou_max.le(self.droploss_iou_thresh).float()
-        return weights.detach()
-    def _forward_box(self, features: Dict[str, torch.Tensor], proposals: List[Instances]):
-        """
-        Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`,
-            the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument.
-        Args:
-            features (dict[str, Tensor]): mapping from feature map names to tensor.
-                Same as in :meth:`ROIHeads.forward`.
-            proposals (list[Instances]): the per-image object proposals with
-                their matching ground truth.
-                Each has fields "proposal_boxes", and "objectness_logits",
-                "gt_classes", "gt_boxes".
-        Returns:
-            In training, a dict of losses.
-            In inference, a list of `Instances`, the predicted instances.
-        """
-        features = [features[f] for f in self.box_in_features]
-        # Shape notes carried over from the original author (config dependent):
-        # pooled features e.g. [512 * batch_size, 256, 7, 7], head output e.g.
-        # [512 * batch_size, 1024], predictions e.g. ([512 * batch_size, 2], [512 * batch_size, 4]).
-        box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
-        box_features = self.box_head(box_features)
-        predictions = self.box_predictor(box_features)
-        del box_features
-        if not self.training:
-            pred_instances, _ = self.box_predictor.inference(predictions, proposals)
-            return pred_instances
-        weights = self._droploss_weights(predictions, proposals) if self.use_droploss else None
-        if weights is not None:
-            losses = self.box_predictor.losses(predictions, proposals, weights=weights)
-        else:
-            losses = self.box_predictor.losses(predictions, proposals)
-        if self.train_on_pred_boxes: # default is false
-            with torch.no_grad():
-                pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
-                    predictions, proposals
-                )
-                for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
-                    proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image)
-        return losses
-    def _forward_mask(self, features: Dict[str, torch.Tensor], instances: List[Instances]):
-        """
-        Forward logic of the mask prediction branch.
-        Args:
-            features (dict[str, Tensor]): mapping from feature map names to tensor.
-                Same as in :meth:`ROIHeads.forward`.
-            instances (list[Instances]): the per-image instances to train/predict masks.
-                In training, they can be the proposals.
-                In inference, they can be the boxes predicted by R-CNN box head.
-        Returns:
-            In training, a dict of losses.
-            In inference, update `instances` with new fields "pred_masks" and return it.
-        """
-        if not self.mask_on:
-            return {} if self.training else instances
-        if self.training:
-            # head is only trained on positive proposals.
-            instances, _ = select_foreground_proposals(instances, self.num_classes)
-        if self.mask_pooler is not None:
-            features = [features[f] for f in self.mask_in_features]
-            boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in instances]
-            features = self.mask_pooler(features, boxes)
-        else:
-            # Without a pooler the head receives the selected feature maps as a dict.
-            features = {f: features[f] for f in self.mask_in_features}
-        return self.mask_head(features, instances)
-    def _forward_keypoint(self, features: Dict[str, torch.Tensor], instances: List[Instances]):
-        """
-        Forward logic of the keypoint prediction branch.
-        Args:
-            features (dict[str, Tensor]): mapping from feature map names to tensor.
-                Same as in :meth:`ROIHeads.forward`.
-            instances (list[Instances]): the per-image instances to train/predict keypoints.
-                In training, they can be the proposals.
-                In inference, they can be the boxes predicted by R-CNN box head.
-        Returns:
-            In training, a dict of losses.
-            In inference, update `instances` with new fields "pred_keypoints" and return it.
-        """
-        if not self.keypoint_on:
-            return {} if self.training else instances
-        if self.training:
-            # head is only trained on positive proposals with >=1 visible keypoints.
-            instances, _ = select_foreground_proposals(instances, self.num_classes)
-            instances = select_proposals_with_visible_keypoints(instances)
-        if self.keypoint_pooler is not None:
-            features = [features[f] for f in self.keypoint_in_features]
-            boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in instances]
-            features = self.keypoint_pooler(features, boxes)
-        else:
-            # Without a pooler the head receives the selected feature maps as a dict.
-            features = {f: features[f] for f in self.keypoint_in_features}
-        return self.keypoint_head(features, instances)

cutler/solver/__init__.py DELETED Viewed

@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-from .build import build_lr_scheduler, build_optimizer, get_default_optimizer_params
-# Export every module-level name that does not start with an underscore.
-__all__ = [name for name in globals() if not name.startswith("_")]