Ryan241005 committed on
Commit
df54550
·
verified ·
1 Parent(s): 8a57a5a

Upload folder using huggingface_hub

Browse files
Pretrain/README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - vision
5
+ inference: false
6
+ pipeline_tag: zero-shot-object-detection
7
+ ---
8
+
9
+ # Grounding DINO model (tiny variant)
10
+
11
+ The Grounding DINO model was proposed in [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. Grounding DINO extends a closed-set object detection model with a text encoder, enabling open-set object detection. The model achieves remarkable results, such as 52.5 AP on COCO zero-shot.
12
+
13
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/grouding_dino_architecture.png"
14
+ alt="drawing" width="600"/>
15
+
16
+ <small> Grounding DINO overview. Taken from the <a href="https://arxiv.org/abs/2303.05499">original paper</a>. </small>
17
+
18
+ ## Intended uses & limitations
19
+
20
+ You can use the raw model for zero-shot object detection (the task of detecting things in an image out-of-the-box without labeled data).
21
+
22
+ ### How to use
23
+
24
+ Here's how to use the model for zero-shot object detection:
25
+
26
+ ```python
27
+ import requests
28
+
29
+ import torch
30
+ from PIL import Image
31
+ from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
32
+
33
+ model_id = "IDEA-Research/grounding-dino-tiny"
34
+ device = "cuda" if torch.cuda.is_available() else "cpu"
35
+
36
+ processor = AutoProcessor.from_pretrained(model_id)
37
+ model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
38
+
39
+ image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
40
+ image = Image.open(requests.get(image_url, stream=True).raw)
41
+ # Check for cats and remote controls
42
+ # VERY important: text queries must be lowercase and each one must end with a dot
43
+ text = "a cat. a remote control."
44
+
45
+ inputs = processor(images=image, text=text, return_tensors="pt").to(device)
46
+ with torch.no_grad():
47
+ outputs = model(**inputs)
48
+
49
+ results = processor.post_process_grounded_object_detection(
50
+ outputs,
51
+ inputs.input_ids,
52
+ box_threshold=0.4,
53
+ text_threshold=0.3,
54
+ target_sizes=[image.size[::-1]]
55
+ )
56
+ ```
57
+
58
+ ### BibTeX entry and citation info
59
+
60
+ ```bibtex
61
+ @misc{liu2023grounding,
62
+ title={Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection},
63
+ author={Shilong Liu and Zhaoyang Zeng and Tianhe Ren and Feng Li and Hao Zhang and Jie Yang and Chunyuan Li and Jianwei Yang and Hang Su and Jun Zhu and Lei Zhang},
64
+ year={2023},
65
+ eprint={2303.05499},
66
+ archivePrefix={arXiv},
67
+ primaryClass={cs.CV}
68
+ }
69
+ ```
Pretrain/added_tokens.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "[CLS]": 101,
3
+ "[MASK]": 103,
4
+ "[PAD]": 0,
5
+ "[SEP]": 102,
6
+ "[UNK]": 100
7
+ }
Pretrain/config.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "relu",
4
+ "architectures": [
5
+ "GroundingDinoForObjectDetection"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "auxiliary_loss": false,
9
+ "backbone": null,
10
+ "backbone_config": {
11
+ "depths": [
12
+ 2,
13
+ 2,
14
+ 6,
15
+ 2
16
+ ],
17
+ "model_type": "swin",
18
+ "num_heads": [
19
+ 3,
20
+ 6,
21
+ 12,
22
+ 24
23
+ ],
24
+ "out_features": [
25
+ "stage2",
26
+ "stage3",
27
+ "stage4"
28
+ ],
29
+ "out_indices": [
30
+ 2,
31
+ 3,
32
+ 4
33
+ ]
34
+ },
35
+ "backbone_kwargs": null,
36
+ "bbox_cost": 5.0,
37
+ "bbox_loss_coefficient": 5.0,
38
+ "class_cost": 1.0,
39
+ "d_model": 256,
40
+ "decoder_attention_heads": 8,
41
+ "decoder_bbox_embed_share": true,
42
+ "decoder_ffn_dim": 2048,
43
+ "decoder_layers": 6,
44
+ "decoder_n_points": 4,
45
+ "disable_custom_kernels": false,
46
+ "dropout": 0.1,
47
+ "embedding_init_target": true,
48
+ "encoder_attention_heads": 8,
49
+ "encoder_ffn_dim": 2048,
50
+ "encoder_layers": 6,
51
+ "encoder_n_points": 4,
52
+ "focal_alpha": 0.25,
53
+ "fusion_dropout": 0.0,
54
+ "fusion_droppath": 0.1,
55
+ "giou_cost": 2.0,
56
+ "giou_loss_coefficient": 2.0,
57
+ "init_std": 0.02,
58
+ "is_encoder_decoder": true,
59
+ "layer_norm_eps": 1e-05,
60
+ "max_text_len": 256,
61
+ "model_type": "grounding-dino",
62
+ "num_feature_levels": 4,
63
+ "num_queries": 900,
64
+ "position_embedding_type": "sine",
65
+ "positional_embedding_temperature": 20,
66
+ "query_dim": 4,
67
+ "text_config": {
68
+ "model_type": "bert"
69
+ },
70
+ "text_enhancer_dropout": 0.0,
71
+ "torch_dtype": "float32",
72
+ "transformers_version": "4.40.0.dev0",
73
+ "two_stage": true,
74
+ "two_stage_bbox_embed_share": false,
75
+ "use_pretrained_backbone": false,
76
+ "use_timm_backbone": false
77
+ }
Pretrain/gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Pretrain/preprocessor_config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_pad": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "format": "coco_detection",
7
+ "image_mean": [
8
+ 0.485,
9
+ 0.456,
10
+ 0.406
11
+ ],
12
+ "image_processor_type": "GroundingDinoImageProcessor",
13
+ "image_std": [
14
+ 0.229,
15
+ 0.224,
16
+ 0.225
17
+ ],
18
+ "processor_class": "GroundingDinoProcessor",
19
+ "resample": 2,
20
+ "rescale_factor": 0.00392156862745098,
21
+ "size": {
22
+ "longest_edge": 1333,
23
+ "shortest_edge": 800
24
+ }
25
+ }
Pretrain/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96df8170959d691ca6bdbe75676dafe5c37d29e48970a666461f755c4fa707be
3
+ size 684703744
Pretrain/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
Pretrain/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Pretrain/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 512,
49
+ "pad_token": "[PAD]",
50
+ "processor_class": "GroundingDinoProcessor",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
Pretrain/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/download_ckpts.sh ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Use wget to download the checkpoints (the wget/curl auto-detection below is commented out)
7
+ CMD="wget"
8
+ #if command -v wget &> /dev/null; then
9
+ # CMD="wget"
10
+ #elif command -v curl &> /dev/null; then
11
+ # CMD="curl -L -O"
12
+ #else
13
+ # echo "Please install wget or curl to download the checkpoints."
14
+ # exit 1
15
+ #fi
16
+ # Define the URLs for SAM 2 checkpoints
17
+ # SAM2_BASE_URL="https://dl.fbaipublicfiles.com/segment_anything_2/072824"
18
+ # sam2_hiera_t_url="${SAM2_BASE_URL}/sam2_hiera_tiny.pt"
19
+ # sam2_hiera_s_url="${SAM2_BASE_URL}/sam2_hiera_small.pt"
20
+ # sam2_hiera_b_plus_url="${SAM2_BASE_URL}/sam2_hiera_base_plus.pt"
21
+ # sam2_hiera_l_url="${SAM2_BASE_URL}/sam2_hiera_large.pt"
22
+ # Download each of the four checkpoints using wget
23
+ # echo "Downloading sam2_hiera_tiny.pt checkpoint..."
24
+ # $CMD $sam2_hiera_t_url || { echo "Failed to download checkpoint from $sam2_hiera_t_url"; exit 1; }
25
+ # echo "Downloading sam2_hiera_small.pt checkpoint..."
26
+ # $CMD $sam2_hiera_s_url || { echo "Failed to download checkpoint from $sam2_hiera_s_url"; exit 1; }
27
+ # echo "Downloading sam2_hiera_base_plus.pt checkpoint..."
28
+ # $CMD $sam2_hiera_b_plus_url || { echo "Failed to download checkpoint from $sam2_hiera_b_plus_url"; exit 1; }
29
+ # echo "Downloading sam2_hiera_large.pt checkpoint..."
30
+ # $CMD $sam2_hiera_l_url || { echo "Failed to download checkpoint from $sam2_hiera_l_url"; exit 1; }
31
+ # Define the URLs for SAM 2.1 checkpoints
32
+ SAM2p1_BASE_URL="https://dl.fbaipublicfiles.com/segment_anything_2/092824"
33
+ sam2p1_hiera_t_url="${SAM2p1_BASE_URL}/sam2.1_hiera_tiny.pt"
34
+ sam2p1_hiera_s_url="${SAM2p1_BASE_URL}/sam2.1_hiera_small.pt"
35
+ sam2p1_hiera_b_plus_url="${SAM2p1_BASE_URL}/sam2.1_hiera_base_plus.pt"
36
+ sam2p1_hiera_l_url="${SAM2p1_BASE_URL}/sam2.1_hiera_large.pt"
37
+ # SAM 2.1 checkpoints
38
+ echo "Downloading sam2.1_hiera_tiny.pt checkpoint..."
39
+ $CMD $sam2p1_hiera_t_url || { echo "Failed to download checkpoint from $sam2p1_hiera_t_url"; exit 1; }
40
+
41
+ echo "Downloading sam2.1_hiera_small.pt checkpoint..."
42
+ $CMD $sam2p1_hiera_s_url || { echo "Failed to download checkpoint from $sam2p1_hiera_s_url"; exit 1; }
43
+
44
+ echo "Downloading sam2.1_hiera_base_plus.pt checkpoint..."
45
+ $CMD $sam2p1_hiera_b_plus_url || { echo "Failed to download checkpoint from $sam2p1_hiera_b_plus_url"; exit 1; }
46
+
47
+ echo "Downloading sam2.1_hiera_large.pt checkpoint..."
48
+ $CMD $sam2p1_hiera_l_url || { echo "Failed to download checkpoint from $sam2p1_hiera_l_url"; exit 1; }
49
+ #echo "All checkpoints are downloaded successfully."
checkpoints/sam2.1_hiera_base_plus.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2345aede8715ab1d5d31b4a509fb160c5a4af1970f199d9054ccfb746c004c5
3
+ size 323606802
checkpoints/sam2.1_hiera_large.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2647878d5dfa5098f2f8649825738a9345572bae2d4350a2468587ece47dd318
3
+ size 898083611
checkpoints/sam2.1_hiera_small.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d1aa6f30de5c92224f8172114de081d104bbd23dd9dc5c58996f0cad5dc4d38
3
+ size 184416285
checkpoints/sam2.1_hiera_tiny.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7402e0d864fa82708a20fbd15bc84245c2f26dff0eb43a4b5b93452deb34be69
3
+ size 156008466
exps/demo/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b03c40259bfbdfc96e490124726ec7db9b2d8e8921f1c68391a9bb29e9ed1220
3
+ size 736657806
gdino_checkpoints/download_ckpts.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
4
+ # All rights reserved.
5
+
6
+ # This source code is licensed under the license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+
9
+
10
+ # Define the URLs for the checkpoints
11
+ BASE_URL="https://github.com/IDEA-Research/GroundingDINO/releases/download/"
12
+ swint_ogc_url="${BASE_URL}v0.1.0-alpha/groundingdino_swint_ogc.pth"
13
+ swinb_cogcoor_url="${BASE_URL}v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth"
14
+
15
+
16
+
17
+ # Download each of the two checkpoints using wget
18
+ echo "Downloading groundingdino_swint_ogc.pth checkpoint..."
19
+ wget $swint_ogc_url || { echo "Failed to download checkpoint from $swint_ogc_url"; exit 1; }
20
+
21
+ echo "Downloading groundingdino_swinb_cogcoor.pth checkpoint..."
22
+ wget $swinb_cogcoor_url || { echo "Failed to download checkpoint from $swinb_cogcoor_url"; exit 1; }
23
+
24
+ echo "All checkpoints are downloaded successfully."
gdino_checkpoints/groundingdino_swinb_cogcoor.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46270f7a822e6906b655b729c90613e48929d0f2bb8b9b76fd10a856f3ac6ab7
3
+ size 938057991
gdino_checkpoints/groundingdino_swint_ogc.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b3ca2563c77c69f651d7bd133e97139c186df06231157a64c507099c52bc799
3
+ size 693997677