Spaces:
Running on T4
Running on T4
TIPSv2 Feature Explorer
Browse files
Explore TIPSv2 representations: PCA visualization, zero-shot segmentation,
DPT depth/normals estimation, and supervised segmentation (ADE20K).
- .gitattributes +32 -0
- .gitignore +3 -0
- README.md +7 -7
- app.py +1020 -0
- examples/depth/ade20k_00003.png +3 -0
- examples/depth/ade20k_00007.png +3 -0
- examples/depth/ade20k_00014.png +3 -0
- examples/depth/ade20k_00022.png +3 -0
- examples/nyuv2/bedroom_00280.jpg +3 -0
- examples/nyuv2/kitchen_00249.jpg +3 -0
- examples/nyuv2/living_room_01260.jpg +3 -0
- examples/nyuv2/office_kitchen_00413.jpg +3 -0
- examples/nyuv2/study_room_00272.jpg +3 -0
- examples/pca/angus.jpeg +3 -0
- examples/pca/cph.jpeg +3 -0
- examples/pca/dadaocheng.jpeg +3 -0
- examples/pca/hike.jpeg +3 -0
- examples/zeroseg/pascal_context_00000_image.png +3 -0
- examples/zeroseg/pascal_context_00007_image.png +3 -0
- examples/zeroseg/pascal_context_00049_image.png +3 -0
- examples/zeroseg/voc_2008_000891.jpg +3 -0
- requirements.txt +10 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,35 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
examples/beach.jpg filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
examples/building.jpg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
examples/city.jpg filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
examples/dog_park.jpg filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
examples/bedroom.jpg filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
examples/pca/angus.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
examples/pca/cph.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
examples/pca/dadaocheng.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
examples/pca/eiffel_tower.jpg filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
examples/pca/hike.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
examples/zeroseg/bus.png filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
examples/zeroseg/birds.png filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
examples/zeroseg/bicycle.png filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
examples/zeroseg/baby.png filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
examples/zeroseg/dog.png filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
examples/zeroseg/sleep.png filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
examples/zeroseg/pc_00106.png filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
examples/zeroseg/pc_00107.png filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
examples/zeroseg/pc_00108.png filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
examples/zeroseg/pc_00109.png filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
examples/zeroseg/pc_00110.png filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
examples/depth/*.png filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
examples/zeroseg/pascal_context_00007.png filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
examples/zeroseg/pascal_context_00029.png filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
examples/zeroseg/pascal_context_00068.png filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
examples/zeroseg/*.png filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
examples/zeroseg_voc/*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
examples/nyuv2/*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
examples/zeroseg/pascal_context_*_image.png filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
examples/zeroseg/*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
examples/pca/*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
examples/pca/*.jpg filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.ruff_cache/
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
README.md
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
---
|
| 2 |
-
title: TIPSv2
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: blue
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
| 1 |
---
|
| 2 |
+
title: TIPSv2 Feature Explorer
|
| 3 |
+
emoji: π
|
| 4 |
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.49.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
short_description: Explore TIPSv2 features, segmentations, depth and normals
|
| 12 |
+
---
|
app.py
ADDED
|
@@ -0,0 +1,1020 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""TIPS Feature Explorer (GPU) β Hugging Face Space demo with ZeroGPU."""
|
| 2 |
+
|
| 3 |
+
import colorsys
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import matplotlib.cm as cm
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
import numpy as np
|
| 9 |
+
import spaces
|
| 10 |
+
import torch
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 13 |
+
from fast_pytorch_kmeans import KMeans as TorchKMeans
|
| 14 |
+
from sklearn.decomposition import PCA
|
| 15 |
+
from torchvision import transforms
|
| 16 |
+
from transformers import AutoModel
|
| 17 |
+
|
| 18 |
+
# ── Constants ────────────────────────────────────────────────────────────────

# Side length (px) of the square input fed to the backbone; the patch grid
# assumes it divides evenly by PATCH_SIZE.
DEFAULT_IMAGE_SIZE = 896
PATCH_SIZE = 14
RESOLUTIONS = [224, 336, 448, 672, 896, 1120, 1372, 1792]

# Zero-shot segmentation runs at a higher resolution for finer masks.
ZEROSEG_IMAGE_SIZE = 1372
# Maximum token length passed to the text tokenizer.
MAX_LEN = 64

# Display name -> HuggingFace model id for each available DPT variant.
VARIANTS = {
    "TIPS v2 β B/14": "google/tipsv2-b14-dpt",
    "TIPS v2 β L/14": "google/tipsv2-l14-dpt",
    "TIPS v2 β SO400m/14": "google/tipsv2-so400m14-dpt",
    "TIPS v2 β g/14": "google/tipsv2-g14-dpt",
}
DEFAULT_VARIANT = "TIPS v2 β L/14"
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _device():
|
| 37 |
+
return torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ββ Pascal Context (59 classes) βββββββββββββββββββββββββββββββββββββββββββββ
|
| 41 |
+
|
| 42 |
+
# Prompt-ensemble templates (TCL/CLIP style); "{}" is filled with a class name.
TCL_PROMPTS = [
    "itap of a {}.",
    "a bad photo of a {}.",
    "a origami {}.",
    "a photo of the large {}.",
    "a {} in a video game.",
    "art of the {}.",
    "a photo of the small {}.",
    "a photo of many {}.",
    "a photo of {}s.",
]
|
| 53 |
+
|
| 54 |
+
PASCAL_CONTEXT_CLASSES = (
|
| 55 |
+
"aeroplane",
|
| 56 |
+
"bag",
|
| 57 |
+
"bed",
|
| 58 |
+
"bedclothes",
|
| 59 |
+
"bench",
|
| 60 |
+
"bicycle",
|
| 61 |
+
"bird",
|
| 62 |
+
"boat",
|
| 63 |
+
"book",
|
| 64 |
+
"bottle",
|
| 65 |
+
"building",
|
| 66 |
+
"bus",
|
| 67 |
+
"cabinet",
|
| 68 |
+
"car",
|
| 69 |
+
"cat",
|
| 70 |
+
"ceiling",
|
| 71 |
+
"chair",
|
| 72 |
+
"cloth",
|
| 73 |
+
"computer",
|
| 74 |
+
"cow",
|
| 75 |
+
"cup",
|
| 76 |
+
"curtain",
|
| 77 |
+
"dog",
|
| 78 |
+
"door",
|
| 79 |
+
"fence",
|
| 80 |
+
"floor",
|
| 81 |
+
"flower",
|
| 82 |
+
"food",
|
| 83 |
+
"grass",
|
| 84 |
+
"ground",
|
| 85 |
+
"horse",
|
| 86 |
+
"keyboard",
|
| 87 |
+
"light",
|
| 88 |
+
"motorbike",
|
| 89 |
+
"mountain",
|
| 90 |
+
"mouse",
|
| 91 |
+
"person",
|
| 92 |
+
"plate",
|
| 93 |
+
"platform",
|
| 94 |
+
"pottedplant",
|
| 95 |
+
"road",
|
| 96 |
+
"rock",
|
| 97 |
+
"sheep",
|
| 98 |
+
"shelves",
|
| 99 |
+
"sidewalk",
|
| 100 |
+
"sign",
|
| 101 |
+
"sky",
|
| 102 |
+
"snow",
|
| 103 |
+
"sofa",
|
| 104 |
+
"table",
|
| 105 |
+
"track",
|
| 106 |
+
"train",
|
| 107 |
+
"tree",
|
| 108 |
+
"truck",
|
| 109 |
+
"tvmonitor",
|
| 110 |
+
"wall",
|
| 111 |
+
"water",
|
| 112 |
+
"window",
|
| 113 |
+
"wood",
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
ADE20K_CLASSES = (
|
| 117 |
+
"wall",
|
| 118 |
+
"building",
|
| 119 |
+
"sky",
|
| 120 |
+
"floor",
|
| 121 |
+
"tree",
|
| 122 |
+
"ceiling",
|
| 123 |
+
"road",
|
| 124 |
+
"bed",
|
| 125 |
+
"windowpane",
|
| 126 |
+
"grass",
|
| 127 |
+
"cabinet",
|
| 128 |
+
"sidewalk",
|
| 129 |
+
"person",
|
| 130 |
+
"earth",
|
| 131 |
+
"door",
|
| 132 |
+
"table",
|
| 133 |
+
"mountain",
|
| 134 |
+
"plant",
|
| 135 |
+
"curtain",
|
| 136 |
+
"chair",
|
| 137 |
+
"car",
|
| 138 |
+
"water",
|
| 139 |
+
"painting",
|
| 140 |
+
"sofa",
|
| 141 |
+
"shelf",
|
| 142 |
+
"house",
|
| 143 |
+
"sea",
|
| 144 |
+
"mirror",
|
| 145 |
+
"rug",
|
| 146 |
+
"field",
|
| 147 |
+
"armchair",
|
| 148 |
+
"seat",
|
| 149 |
+
"fence",
|
| 150 |
+
"desk",
|
| 151 |
+
"rock",
|
| 152 |
+
"wardrobe",
|
| 153 |
+
"lamp",
|
| 154 |
+
"bathtub",
|
| 155 |
+
"railing",
|
| 156 |
+
"cushion",
|
| 157 |
+
"base",
|
| 158 |
+
"box",
|
| 159 |
+
"column",
|
| 160 |
+
"signboard",
|
| 161 |
+
"chest_of_drawers",
|
| 162 |
+
"counter",
|
| 163 |
+
"sand",
|
| 164 |
+
"sink",
|
| 165 |
+
"skyscraper",
|
| 166 |
+
"fireplace",
|
| 167 |
+
"refrigerator",
|
| 168 |
+
"grandstand",
|
| 169 |
+
"path",
|
| 170 |
+
"stairs",
|
| 171 |
+
"runway",
|
| 172 |
+
"case",
|
| 173 |
+
"pool_table",
|
| 174 |
+
"pillow",
|
| 175 |
+
"screen_door",
|
| 176 |
+
"stairway",
|
| 177 |
+
"river",
|
| 178 |
+
"bridge",
|
| 179 |
+
"bookcase",
|
| 180 |
+
"blind",
|
| 181 |
+
"coffee_table",
|
| 182 |
+
"toilet",
|
| 183 |
+
"flower",
|
| 184 |
+
"book",
|
| 185 |
+
"hill",
|
| 186 |
+
"bench",
|
| 187 |
+
"countertop",
|
| 188 |
+
"stove",
|
| 189 |
+
"palm",
|
| 190 |
+
"kitchen_island",
|
| 191 |
+
"computer",
|
| 192 |
+
"swivel_chair",
|
| 193 |
+
"boat",
|
| 194 |
+
"bar",
|
| 195 |
+
"arcade_machine",
|
| 196 |
+
"hovel",
|
| 197 |
+
"bus",
|
| 198 |
+
"towel",
|
| 199 |
+
"light",
|
| 200 |
+
"truck",
|
| 201 |
+
"tower",
|
| 202 |
+
"chandelier",
|
| 203 |
+
"awning",
|
| 204 |
+
"streetlight",
|
| 205 |
+
"booth",
|
| 206 |
+
"television",
|
| 207 |
+
"airplane",
|
| 208 |
+
"dirt_track",
|
| 209 |
+
"apparel",
|
| 210 |
+
"pole",
|
| 211 |
+
"land",
|
| 212 |
+
"bannister",
|
| 213 |
+
"escalator",
|
| 214 |
+
"ottoman",
|
| 215 |
+
"bottle",
|
| 216 |
+
"buffet",
|
| 217 |
+
"poster",
|
| 218 |
+
"stage",
|
| 219 |
+
"van",
|
| 220 |
+
"ship",
|
| 221 |
+
"fountain",
|
| 222 |
+
"conveyer_belt",
|
| 223 |
+
"canopy",
|
| 224 |
+
"washer",
|
| 225 |
+
"plaything",
|
| 226 |
+
"swimming_pool",
|
| 227 |
+
"stool",
|
| 228 |
+
"barrel",
|
| 229 |
+
"basket",
|
| 230 |
+
"waterfall",
|
| 231 |
+
"tent",
|
| 232 |
+
"bag",
|
| 233 |
+
"minibike",
|
| 234 |
+
"cradle",
|
| 235 |
+
"oven",
|
| 236 |
+
"ball",
|
| 237 |
+
"food",
|
| 238 |
+
"step",
|
| 239 |
+
"tank",
|
| 240 |
+
"trade_name",
|
| 241 |
+
"microwave",
|
| 242 |
+
"pot",
|
| 243 |
+
"animal",
|
| 244 |
+
"bicycle",
|
| 245 |
+
"lake",
|
| 246 |
+
"dishwasher",
|
| 247 |
+
"screen",
|
| 248 |
+
"blanket",
|
| 249 |
+
"sculpture",
|
| 250 |
+
"hood",
|
| 251 |
+
"sconce",
|
| 252 |
+
"vase",
|
| 253 |
+
"traffic_light",
|
| 254 |
+
"tray",
|
| 255 |
+
"ashcan",
|
| 256 |
+
"fan",
|
| 257 |
+
"pier",
|
| 258 |
+
"crt_screen",
|
| 259 |
+
"plate",
|
| 260 |
+
"monitor",
|
| 261 |
+
"bulletin_board",
|
| 262 |
+
"shower",
|
| 263 |
+
"radiator",
|
| 264 |
+
"glass",
|
| 265 |
+
"clock",
|
| 266 |
+
"flag",
|
| 267 |
+
)
|
| 268 |
+
|
| 269 |
+
NUM_ADE20K_CLASSES = 150
# Row 0 stays black (background); rows 1..150 get distinct HSV-derived colours.
ADE20K_PALETTE = np.zeros((NUM_ADE20K_CLASSES + 1, 3), dtype=np.uint8)
for i in range(1, NUM_ADE20K_CLASSES + 1):
    # Golden-ratio hue stepping spreads hues evenly around the colour wheel;
    # saturation/value cycle through a few discrete levels for extra contrast.
    hue = (i * 0.618033988749895) % 1.0
    saturation = 0.65 + 0.35 * ((i * 7) % 5) / 4.0
    value = 0.70 + 0.30 * ((i * 11) % 3) / 2.0
    red, green, blue = colorsys.hsv_to_rgb(hue, saturation, value)
    ADE20K_PALETTE[i] = [int(red * 255), int(green * 255), int(blue * 255)]
|
| 277 |
+
|
| 278 |
+
# ββ Model state (one model loaded at a time) βββββββββββββββββββββββββββββββ
|
| 279 |
+
|
| 280 |
+
_model = {
|
| 281 |
+
"name": None,
|
| 282 |
+
"vision": None,
|
| 283 |
+
"text": None,
|
| 284 |
+
"tokenizer": None,
|
| 285 |
+
"temperature": None,
|
| 286 |
+
"ade20k_embs": None,
|
| 287 |
+
"dpt": None,
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def load_variant(name):
    """Load a DPT model variant from HuggingFace (includes the backbone)."""
    global _model
    if _model["name"] == name:
        # Requested variant is already resident — nothing to do.
        return
    dpt = AutoModel.from_pretrained(VARIANTS[name], trust_remote_code=True)
    dpt.eval()
    # Calling _get_backbone() forces the backbone weights to download.
    dpt._get_backbone()
    backbone = dpt._backbone
    _model.update(
        name=name,
        dpt=dpt,
        vision=backbone.vision_encoder,
        text=backbone.text_encoder,
        tokenizer=backbone._load_tokenizer(),
        temperature=backbone.config.temperature,
        ade20k_embs=None,  # text embeddings must be recomputed per variant
    )
    print(f"Loaded {name}")
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def _move_models_to_device():
    """Move models to the current device (GPU inside @spaces.GPU, else CPU)."""
    dev = _device()
    # Only the heavyweight modules need moving; skip slots that are empty.
    for slot in ("vision", "text", "dpt"):
        module = _model[slot]
        if module is not None:
            module.to(dev)
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
def _ensure_ade20k_embs():
    """Pre-compute Pascal Context text embeddings if not yet done (must run on GPU)."""
    if _model["ade20k_embs"] is not None:
        return
    dev = _device()
    text_encoder = _model["text"]
    tokenizer = _model["tokenizer"]
    per_template = []
    for template in TCL_PROMPTS:
        prompts = [template.format(cls) for cls in PASCAL_CONTEXT_CLASSES]
        ids, paddings = tokenizer.tokenize(prompts, max_len=MAX_LEN)
        with torch.no_grad():
            embs = text_encoder(
                torch.from_numpy(ids).to(dev),
                torch.from_numpy(paddings).to(dev),
            )
        per_template.append(embs.cpu().numpy())
    # Average the prompt ensemble, then re-normalise the class embeddings.
    _model["ade20k_embs"] = l2_normalize(np.mean(per_template, axis=0))
    print("Pascal Context text embeddings computed.")
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
def _init_model():
    """Load model + move to GPU + compute text embeddings."""
    # Fall back to the default variant when nothing has been loaded yet.
    variant = _model["name"] or DEFAULT_VARIANT
    load_variant(variant)
    _move_models_to_device()
    _ensure_ade20k_embs()
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
# ββ Preprocessing & helpers βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
def preprocess(img, size=DEFAULT_IMAGE_SIZE):
    """Resize *img* to a (size, size) square and convert it to a float tensor."""
    pipeline = transforms.Compose(
        [transforms.Resize((size, size)), transforms.ToTensor()]
    )
    return pipeline(img)
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
def l2_normalize(x, axis=-1):
    """L2-normalise *x* along *axis*; norms below 1e-3 are clamped to avoid blow-up."""
    norms = np.linalg.norm(x, ord=2, axis=axis, keepdims=True)
    return x / norms.clip(min=1e-3)
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def upsample(arr, h, w, mode="bilinear"):
    """Upsample (H, W, C) or (H, W) numpy array to (h, w, ...)."""
    t = torch.from_numpy(arr).float()
    if t.ndim == 2:
        t = t.unsqueeze(-1)  # promote (H, W) to (H, W, 1)
    nchw = t.permute(2, 0, 1).unsqueeze(0)  # F.interpolate wants (1, C, H, W)
    if mode == "bilinear":
        # align_corners is only a valid kwarg for interpolating modes.
        out = F.interpolate(nchw, size=(h, w), mode=mode, align_corners=False)
    else:
        out = F.interpolate(nchw, size=(h, w), mode=mode)
    return out[0].permute(1, 2, 0).numpy()
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
def to_uint8(x):
    """Scale a float array in [0, 1] to uint8 pixel values (truncating cast)."""
    scaled = np.clip(x * 255, 0, 255)
    return scaled.astype(np.uint8)
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
# ββ Feature extraction (GPU-accelerated) ββββββββββββββββββββββββββββββββββββ
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
@torch.no_grad()
def extract_features(image_np, resolution=DEFAULT_IMAGE_SIZE):
    """Return spatial features (sp, sp, D) as numpy. sp = resolution // 14."""
    img = Image.fromarray(image_np).convert("RGB")
    batch = preprocess(img, resolution).unsqueeze(0).to(_device())
    # The vision encoder returns a triple; only the patch tokens are needed here.
    _, _, patch_tokens = _model["vision"](batch)
    side = resolution // PATCH_SIZE
    return patch_tokens.cpu().reshape(side, side, -1).numpy()
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
@torch.no_grad()
def extract_features_value_attention(image_np, resolution=ZEROSEG_IMAGE_SIZE):
    """Return spatial features (sp, sp, D) using Value Attention on GPU.

    This follows the Colab reference implementation: run all blocks except the
    last normally, then for the last block extract V from QKV and manually
    apply out_proj, layer scale, residual, norm2, MLP + layer scale, second
    residual, and final norm.
    """
    dev = _device()
    encoder = _model["vision"]
    img = Image.fromarray(image_np).convert("RGB")
    tensor = preprocess(img, resolution).unsqueeze(0).to(dev)

    x = encoder.prepare_tokens_with_masks(tensor)

    # Every transformer block except the last runs unmodified.
    for blk in encoder.blocks[:-1]:
        x = blk(x)

    last = encoder.blocks[-1]
    # Default of 1 register token when the attribute is absent — TODO confirm
    # against the backbone implementation.
    num_reg = getattr(encoder, "num_register_tokens", 1)

    b_dim, n_dim, c_dim = x.shape
    num_heads = last.attn.num_heads
    qkv = last.attn.qkv(last.norm1(x))
    qkv = qkv.reshape(b_dim, n_dim, 3, num_heads, c_dim // num_heads)
    qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, B, H, N, D_head)

    # Keep only the value projection (index 2) — "value attention".
    v = qkv[2]  # (B, H, N, D_head)
    v_out = v.transpose(1, 2).reshape(b_dim, n_dim, c_dim)
    v_out = last.attn.proj(v_out)
    v_out = last.ls1(v_out)
    x_val = v_out + x  # first residual connection

    # Standard MLP sub-block with layer scale and the second residual.
    y_val = last.ls2(last.mlp(last.norm2(x_val)))
    x_val = x_val + y_val

    x_val = encoder.norm(x_val)

    # Drop CLS + register tokens; only patch tokens form the spatial map.
    patch_tokens = x_val[:, 1 + num_reg :, :]
    side = resolution // PATCH_SIZE
    return patch_tokens.cpu().reshape(side, side, -1).numpy()
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
# ββ PCA Visualisations ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
def vis_pca(spatial):
    """Project spatial features onto their top-3 PCA components as an RGB image."""
    h, w = spatial.shape[0], spatial.shape[1]
    flat = spatial.reshape(-1, spatial.shape[-1])
    components = PCA(n_components=3, whiten=True).fit_transform(flat)
    rgb = components.reshape(h, w, 3)
    # Sigmoid squashes the whitened components into (0, 1) for display.
    rgb = 1.0 / (1.0 + np.exp(-2.0 * rgb))
    return to_uint8(rgb)
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
def vis_depth(spatial):
    """Visualise the 1st PCA component of *spatial* with the inferno colormap.

    Args:
        spatial: (H, W, D) feature array.

    Returns:
        (H, W, 3) uint8 RGB image at feature-grid resolution.
    """
    h, w = spatial.shape[0], spatial.shape[1]
    flat = spatial.reshape(-1, spatial.shape[-1])
    depth = PCA(n_components=1).fit_transform(flat).reshape(h, w)
    # Min-max normalise to [0, 1]; epsilon guards against a constant map.
    depth = (depth - depth.min()) / (depth.max() - depth.min() + 1e-8)
    # cm.get_cmap() was removed in matplotlib 3.9; use the colormap attribute
    # directly — same Colormap object, no deprecated accessor.
    colored = cm.inferno(depth)[:, :, :3].astype(np.float32)
    return to_uint8(colored)
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
def vis_kmeans(spatial, h, w, n_clusters=6):
    """K-means clustering of spatial features rendered as a colour map."""
    sp_h, sp_w = spatial.shape[:2]
    feat = torch.from_numpy(spatial.reshape(-1, spatial.shape[-1])).to(_device())
    km = TorchKMeans(n_clusters=n_clusters, max_iter=20)
    km.fit(feat)
    # Negated distances act as per-cluster scores (closer centroid => larger).
    scores = (-torch.cdist(feat, km.centroids)).cpu().numpy()
    scores = scores.reshape(sp_h, sp_w, n_clusters)
    # Upsample the scores (not the hard labels) so boundaries stay smooth.
    labels = upsample(scores, h, w, mode="bilinear").argmax(axis=-1)
    palette = plt.cm.tab20(np.linspace(0, 1, n_clusters))[:, :3]
    return to_uint8(palette[labels].astype(np.float32))
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
# ββ Zero-shot Segmentation ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 481 |
+
|
| 482 |
+
|
| 483 |
+
def vis_custom_semseg(spatial, orig_image, classes, class_embs):
    """Zero-shot semantic segmentation with user-defined classes.

    Returns (overlay with legend, raw mask image, detected-classes string,
    undetected-classes string).
    """
    h, w = orig_image.shape[:2]
    sp_h, sp_w = spatial.shape[:2]
    n = len(classes)

    # Cosine similarity between each patch feature and each class embedding.
    feat = l2_normalize(spatial.reshape(-1, spatial.shape[-1]))
    sim_map = (feat @ class_embs.T).reshape(sp_h, sp_w, n)

    # Upsample similarities before the argmax so label edges stay smooth.
    labels = upsample(sim_map, h, w, mode="bilinear").argmax(axis=-1)

    palette = (plt.cm.tab20(np.linspace(0, 1, max(n, 2)))[:n, :3] * 255).astype(
        np.uint8
    )

    seg_rgb = palette[labels].astype(np.float32) / 255.0
    mask_img = to_uint8(seg_rgb)

    # Heavily weight the mask over the photo for the overlay.
    blend = 0.1 * orig_image.astype(np.float32) / 255.0 + 0.9 * seg_rgb
    blend_img = Image.fromarray(to_uint8(blend))

    # Class ids present in the prediction, largest area first.
    unique_ids, counts = np.unique(labels, return_counts=True)
    order = np.argsort(-counts)
    unique_ids, counts = unique_ids[order], counts[order]
    total = counts.sum()

    try:
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
            60,
        )
    except OSError:
        # Font not installed in this environment — fall back to PIL's default.
        font = ImageFont.load_default()

    # Legend layout: at most 10 rows, drawn on a white strip to the right.
    n_legend = min(len(unique_ids), 10)
    row_h = 80
    swatch_w = 60
    pad = 12
    legend_w = 450

    legend_h = max(h, n_legend * row_h + pad * 2)
    canvas = Image.new("RGB", (w + legend_w, legend_h), (255, 255, 255))
    canvas.paste(blend_img, (0, 0))
    draw = ImageDraw.Draw(canvas)

    for row in range(n_legend):
        cid = unique_ids[row]
        swatch_color = tuple(palette[cid].tolist())
        y_top = pad + row * row_h
        draw.rectangle(
            [w + pad, y_top, w + pad + swatch_w, y_top + swatch_w],
            fill=swatch_color,
            outline=(0, 0, 0),
        )
        draw.text(
            (w + pad + swatch_w + 8, y_top + 6),
            classes[cid],
            fill="black",
            font=font,
        )

    overlay_out = np.array(canvas)

    # Split classes into "detected" (>= 2% of pixels) and minor/absent ones.
    detected_parts, minor_parts = [], []
    for idx, cid in enumerate(unique_ids):
        pct = counts[idx] / total * 100
        if pct >= 2:
            detected_parts.append(f"{classes[cid]} ({pct:.1f}%)")
        else:
            minor_parts.append(f"{classes[cid]} ({pct:.1f}%)")
    present = set(unique_ids.tolist())
    absent = [f"{classes[i]} (0.0%)" for i in range(n) if i not in present]
    detected_str = ", ".join(detected_parts)
    undetected_str = ", ".join(minor_parts + absent)
    return overlay_out, mask_img, detected_str, undetected_str
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
# ββ DPT Depth Inference βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 564 |
+
|
| 565 |
+
|
| 566 |
+
def vis_depth_dpt(depth_map, h, w):
    """Colour a depth map with the turbo colormap and resize to (h, w).

    Args:
        depth_map: 2-D (possibly singleton-batched) depth array.
        h, w: target output size in pixels.

    Returns:
        (h, w, 3) uint8 RGB image.
    """
    d = depth_map.squeeze()
    # Min-max normalise to [0, 1]; epsilon guards against a constant map.
    d = (d - d.min()) / (d.max() - d.min() + 1e-8)
    # cm.get_cmap() was removed in matplotlib 3.9; use the colormap attribute
    # directly — same Colormap object, no deprecated accessor.
    colored = cm.turbo(d)[:, :, :3].astype(np.float32)
    return to_uint8(upsample(colored, h, w))
|
| 572 |
+
|
| 573 |
+
|
| 574 |
+
def vis_normals_dpt(normals_map, h, w):
    """Map normals from [-1, 1] to [0, 1] and resize to original size."""
    normals = (normals_map.cpu().numpy() + 1.0) / 2.0
    normals = normals.transpose(1, 2, 0)  # (3, H, W) -> (H, W, 3)
    return to_uint8(upsample(normals, h, w))
|
| 580 |
+
|
| 581 |
+
|
| 582 |
+
def vis_segmentation_dpt(seg_map, orig_image):
    """Colour a segmentation map with the ADE20K colormap + legend."""
    h, w = orig_image.shape[:2]
    logits = seg_map.cpu().numpy().transpose(1, 2, 0)  # (H, W, 150)
    # Upsample logits (not hard labels) so class boundaries stay smooth.
    pred = upsample(logits, h, w, mode="bilinear").argmax(axis=-1)  # (h, w)
    # Palette row 0 is background, so class ids are shifted by one.
    seg_rgb = ADE20K_PALETTE[pred.astype(np.int32) + 1].astype(np.float32) / 255.0

    blend = 0.15 * orig_image.astype(np.float32) / 255.0 + 0.85 * seg_rgb
    blend_img = Image.fromarray(to_uint8(blend))

    # Class ids present in the prediction, largest area first.
    unique_ids, counts = np.unique(pred, return_counts=True)
    total_pixels = counts.sum()
    order = np.argsort(-counts)
    unique_ids, counts = unique_ids[order], counts[order]

    # Keep only classes that cover at least 2% of the image.
    pcts = counts / total_pixels * 100
    keep = pcts >= 2.0
    unique_ids, counts, pcts = unique_ids[keep], counts[keep], pcts[keep]

    try:
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
            36,
        )
    except OSError:
        # Font not installed in this environment — fall back to PIL's default.
        font = ImageFont.load_default()

    # Legend layout: at most 10 rows, drawn on a white strip to the right.
    n_legend = min(len(unique_ids), 10)
    row_h, swatch_w, pad, legend_w = 50, 40, 10, 450
    legend_h = max(h, n_legend * row_h + pad * 2)
    canvas = Image.new("RGB", (w + legend_w, legend_h), (255, 255, 255))
    canvas.paste(blend_img, (0, 0))
    draw = ImageDraw.Draw(canvas)

    for row in range(n_legend):
        cid = unique_ids[row]
        swatch_color = tuple(ADE20K_PALETTE[cid + 1].tolist())
        name = ADE20K_CLASSES[cid] if cid < len(ADE20K_CLASSES) else f"class_{cid}"
        y_top = pad + row * row_h
        draw.rectangle(
            [w + pad, y_top, w + pad + swatch_w, y_top + swatch_w],
            fill=swatch_color,
            outline=(0, 0, 0),
        )
        draw.text(
            (w + pad + swatch_w + 8, y_top + 4),
            name,
            fill="black",
            font=font,
        )

    return np.array(canvas)
|
| 635 |
+
|
| 636 |
+
|
| 637 |
+
# ββ Gradio callbacks ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 638 |
+
|
| 639 |
+
|
| 640 |
+
@spaces.GPU
def on_variant_change(variant_name):
    """Load the selected model variant and clear stale per-variant outputs."""
    load_variant(variant_name)
    _move_models_to_device()
    _ensure_ade20k_embs()
    # Cleared in wiring order: pca_out, depth_out, kmeans_out, pca_state,
    # custom_overlay, custom_mask, then the two custom text boxes.
    return (None,) * 6 + ("", "")
|
| 655 |
+
|
| 656 |
+
|
| 657 |
+
@spaces.GPU
def on_pca_extract(image, resolution, _pca_state):
    """Extract spatial features and render PCA / 1st-component / k-means views.

    Returns (pca_image, first_component_image, kmeans_image, new_state);
    all None when no image was supplied.
    """
    if image is None:
        return None, None, None, None

    _init_model()
    res = int(resolution)

    feats = extract_features(image, res)
    img_h, img_w = image.shape[:2]

    # Cache features so re-clustering can skip extraction for the same
    # variant + resolution.
    new_state = {
        "spatial": feats,
        "orig_image": image,
        "variant": _model["name"],
        "resolution": res,
    }
    return (
        vis_pca(feats),
        vis_depth(feats),
        vis_kmeans(feats, img_h, img_w),
        new_state,
    )
|
| 675 |
+
|
| 676 |
+
|
| 677 |
+
@spaces.GPU
def on_recluster(image, resolution, n_clusters, pca_state):
    """Re-run k-means, reusing cached features unless they are stale."""
    if image is None:
        gr.Warning("Upload an image first.")
        return None, pca_state

    _init_model()
    res = int(resolution)

    # The cache is valid only for the same model variant and resolution.
    cache_hit = (
        pca_state is not None
        and pca_state.get("variant") == _model["name"]
        and pca_state.get("resolution") == res
    )
    if cache_hit:
        feats = pca_state["spatial"]
    else:
        feats = extract_features(image, res)
        pca_state = {
            "spatial": feats,
            "orig_image": image,
            "variant": _model["name"],
            "resolution": res,
        }

    img_h, img_w = image.shape[:2]
    return vis_kmeans(feats, img_h, img_w, int(n_clusters)), pca_state
|
| 700 |
+
|
| 701 |
+
|
| 702 |
+
@spaces.GPU
def on_zeroseg_custom(image, resolution, class_names_str):
    """Zero-shot segmentation with user-supplied class names.

    Embeds each comma-separated class name with every TCL prompt template,
    averages and re-normalizes the text embeddings, then matches them
    against value-attention image features.

    Returns (overlay_image, mask_image, detected_str, undetected_str).
    """
    if image is None or not class_names_str or not class_names_str.strip():
        gr.Warning("Upload an image and enter at least one class name.")
        return None, None, "", ""
    _init_model()
    resolution = int(resolution)
    classes = [c.strip() for c in class_names_str.split(",") if c.strip()]
    if not classes:
        # Fix: input such as "," or ", ," passes the guard above but strips
        # to an empty class list; warn instead of failing silently, for
        # consistency with the first guard.
        gr.Warning("Upload an image and enter at least one class name.")
        return None, None, "", ""

    dev = _device()
    all_embs = []
    for template in TCL_PROMPTS:
        prompts = [template.format(c) for c in classes]
        ids, paddings = _model["tokenizer"].tokenize(prompts, max_len=MAX_LEN)
        with torch.no_grad():  # inference only; no autograd graph needed
            embs = _model["text"](
                torch.from_numpy(ids).to(dev),
                torch.from_numpy(paddings).to(dev),
            )
        all_embs.append(embs.cpu().numpy())
    # Average the per-template embeddings for each class, then re-normalize.
    class_embs = l2_normalize(np.mean(all_embs, axis=0))

    spatial = extract_features_value_attention(image, resolution)
    overlay, mask, detected, undetected = vis_custom_semseg(
        spatial,
        image,
        classes,
        class_embs,
    )
    return overlay, mask, detected, undetected
|
| 734 |
+
|
| 735 |
+
|
| 736 |
+
@spaces.GPU
def on_depth_normals_predict(image, dpt_variant, resolution):  # noqa: ARG001
    """Run DPT depth and normals prediction.

    Returns (depth_visualization, normals_visualization), both rendered at
    the input image's original dimensions.
    """
    if image is None:
        return None, None

    _init_model()
    dev = _device()

    img_h, img_w = image.shape[:2]
    rgb = Image.fromarray(image).convert("RGB")
    batch = preprocess(rgb, int(resolution)).unsqueeze(0).to(dev)

    dpt = _model["dpt"]
    depth_vis = vis_depth_dpt(
        dpt.predict_depth(batch)[0, 0].cpu().numpy(), img_h, img_w
    )
    normals_vis = vis_normals_dpt(dpt.predict_normals(batch)[0], img_h, img_w)
    return depth_vis, normals_vis
|
| 755 |
+
|
| 756 |
+
|
| 757 |
+
@spaces.GPU
def on_segmentation_predict(image, dpt_variant, resolution):  # noqa: ARG001
    """Run DPT segmentation prediction (ADE20K classes)."""
    if image is None:
        return None

    _init_model()
    dev = _device()

    rgb = Image.fromarray(image).convert("RGB")
    batch = preprocess(rgb, int(resolution)).unsqueeze(0).to(dev)

    seg = _model["dpt"].predict_segmentation(batch)
    return vis_segmentation_dpt(seg[0], image)
|
| 770 |
+
|
| 771 |
+
|
| 772 |
+
# ββ UI ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 773 |
+
|
| 774 |
+
custom_css = """
|
| 775 |
+
#pca_output_image img, #depth_output_image img {
|
| 776 |
+
image-rendering: pixelated;
|
| 777 |
+
object-fit: contain;
|
| 778 |
+
}
|
| 779 |
+
"""
|
| 780 |
+
|
| 781 |
+
head = """
|
| 782 |
+
<!-- Google tag (gtag.js) -->
|
| 783 |
+
<script async src="https://www.googletagmanager.com/gtag/js?id=G-P13E18K71N"></script>
|
| 784 |
+
<script>
|
| 785 |
+
window.dataLayer = window.dataLayer || [];
|
| 786 |
+
function gtag(){dataLayer.push(arguments);}
|
| 787 |
+
gtag('js', new Date());
|
| 788 |
+
|
| 789 |
+
gtag('config', 'G-P13E18K71N');
|
| 790 |
+
</script>
|
| 791 |
+
"""
|
| 792 |
+
|
| 793 |
+
# Top-level Gradio app: four tabs (PCA, zero-shot seg, depth/normals,
# supervised seg) sharing the model-variant and resolution dropdowns.
with gr.Blocks(head=head, title="TIPSv2 Feature Explorer", css=custom_css) as demo:
    gr.Markdown(
        "## TIPSv2 Feature Explorer\n"
        "Explore TIPSv2 representations here! For more information, see: "
        "https://gdm-tipsv2.github.io/",
    )

    # Global controls shared by every tab's callbacks.
    with gr.Row():
        variant_dd = gr.Dropdown(
            choices=list(VARIANTS.keys()),
            value=DEFAULT_VARIANT,
            label="Model variant",
        )
        resolution_dd = gr.Dropdown(
            choices=RESOLUTIONS,
            value=DEFAULT_IMAGE_SIZE,
            label="Resolution (higher = better quality, slower)",
        )

    # -- PCA / Feature Visualization tab -----------------------------------
    with gr.Tab("π¨ PCA & Feature Visualization"):
        # Caches extracted features (see on_pca_extract / on_recluster).
        pca_state = gr.State(None)

        with gr.Row():
            with gr.Column():
                pca_input = gr.Image(type="numpy", label="Input image")
                pca_btn = gr.Button("Extract Features", variant="primary")

            with gr.Column():
                with gr.Tabs():
                    with gr.Tab("PCA"):
                        pca_out = gr.Image(
                            label="PCA (3 components β RGB)",
                            height=448,
                            elem_id="pca_output_image",
                        )
                    with gr.Tab("PCA (1st component)"):
                        depth_out = gr.Image(
                            label="1st PCA component",
                            height=448,
                            elem_id="depth_output_image",
                        )
                    with gr.Tab("K-means Clustering"):
                        n_clusters = gr.Slider(
                            2,
                            20,
                            value=6,
                            step=1,
                            label="Clusters",
                        )
                        recluster_btn = gr.Button("Re-cluster")
                        kmeans_out = gr.Image(label="K-means clusters")

        gr.Markdown("π **Click the examples below to explore!**")
        gr.Examples(
            examples=[
                ["examples/pca/hike.jpeg"],
                ["examples/pca/cph.jpeg"],
                ["examples/pca/angus.jpeg"],
                ["examples/pca/dadaocheng.jpeg"],
            ],
            inputs=[pca_input],
        )

    # -- Zero-shot Segmentation tab ----------------------------------------
    with gr.Tab("βοΈ Zero-shot Segmentation"):
        gr.Markdown(
            "Define your own classes for zero-shot segmentation. "
            "Enter class names separated by commas.",
        )

        with gr.Row():
            with gr.Column():
                custom_input = gr.Image(type="numpy", label="Input image", height=448)
                custom_classes = gr.Textbox(
                    label="Class names (comma-separated)",
                    value="class1, class2, class3",
                    placeholder="e.g. cat, dog, sky, grass",
                )
                custom_btn = gr.Button("Segment", variant="primary")

            with gr.Column():
                with gr.Tabs():
                    with gr.Tab("Overlay"):
                        custom_overlay = gr.Image(
                            label="Segmentation overlay",
                            height=448,
                        )
                    with gr.Tab("Mask"):
                        custom_mask = gr.Image(
                            label="Segmentation mask",
                            height=448,
                        )
                custom_detected = gr.Textbox(
                    label="Detected classes (sorted by area)",
                    lines=2,
                )
                custom_undetected = gr.Textbox(label="Not detected", lines=2)

        gr.Markdown("π **Click the examples below to explore!**")
        gr.Examples(
            examples=[
                ["examples/zeroseg/voc_2008_000891.jpg", "dog, cage, cloth, dog bowl"],
                [
                    "examples/zeroseg/pascal_context_00000_image.png",
                    "bike, tree, fence, soccer, floor, chair, cushion",
                ],
                [
                    "examples/zeroseg/pascal_context_00007_image.png",
                    "dog, table, chair, carpet, shoes",
                ],
                [
                    "examples/zeroseg/pascal_context_00049_image.png",
                    "bus, snow, mountain, house, road",
                ],
            ],
            inputs=[custom_input, custom_classes],
        )

    # -- Depth/Normals Visualization tab -----------------------------------
    with gr.Tab("ποΈ Depth/Normals Visualization"):
        gr.Markdown(
            "Monocular depth and surface normals estimation using a **DPT "
            "(Dense Prediction Transformer)** head on top of a **frozen** "
            "TIPS v2 vision encoder. Trained on the **NYU Depth V2** dataset.",
        )

        with gr.Row():
            with gr.Column():
                depth_input = gr.Image(type="numpy", label="Input image", height=448)
                depth_btn = gr.Button("Predict Depth & Normals", variant="primary")

            with gr.Column():
                dpt_depth_out = gr.Image(label="DPT Depth Map", height=448)

            with gr.Column():
                dpt_normals_out = gr.Image(
                    label="DPT Surface Normals",
                    height=448,
                )

        gr.Markdown("π **Click the examples below to explore!**")
        gr.Examples(
            examples=[
                ["examples/nyuv2/bedroom_00280.jpg"],
                ["examples/nyuv2/kitchen_00249.jpg"],
                ["examples/nyuv2/living_room_01260.jpg"],
                ["examples/nyuv2/office_kitchen_00413.jpg"],
                ["examples/nyuv2/study_room_00272.jpg"],
            ],
            inputs=[depth_input],
        )

    # -- Supervised Segmentation tab ---------------------------------------
    with gr.Tab("π Supervised Segmentation"):
        gr.Markdown(
            "Semantic segmentation using a **DPT (Dense Prediction "
            "Transformer)** head on top of a **frozen** TIPS v2 vision "
            "encoder. Trained on ADE20K (150 classes).",
        )

        with gr.Row():
            with gr.Column():
                seg_input = gr.Image(type="numpy", label="Input image", height=448)
                seg_btn = gr.Button("Segment", variant="primary")

            with gr.Column():
                seg_out = gr.Image(label="DPT Segmentation (ADE20K)", height=448)

        gr.Markdown("π **Click the examples below to explore!**")
        gr.Examples(
            examples=[
                ["examples/depth/ade20k_00003.png"],
                ["examples/depth/ade20k_00007.png"],
                ["examples/depth/ade20k_00014.png"],
                ["examples/depth/ade20k_00022.png"],
            ],
            inputs=[seg_input],
        )

    # -- Event wiring -------------------------------------------------------

    # Switching variants clears every cached/visual output so nothing stale
    # from the previous model remains on screen.
    variant_dd.change(
        fn=on_variant_change,
        inputs=[variant_dd],
        outputs=[
            pca_out,
            depth_out,
            kmeans_out,
            pca_state,
            custom_overlay,
            custom_mask,
            custom_detected,
            custom_undetected,
        ],
    )

    pca_btn.click(
        fn=on_pca_extract,
        inputs=[pca_input, resolution_dd, pca_state],
        outputs=[pca_out, depth_out, kmeans_out, pca_state],
    )
    recluster_btn.click(
        fn=on_recluster,
        inputs=[pca_input, resolution_dd, n_clusters, pca_state],
        outputs=[kmeans_out, pca_state],
    )

    # variant_dd is passed as an input here but the callbacks ignore it
    # (noqa: ARG001) — presumably kept for signature stability; verify.
    depth_btn.click(
        fn=on_depth_normals_predict,
        inputs=[depth_input, variant_dd, resolution_dd],
        outputs=[dpt_depth_out, dpt_normals_out],
    )

    seg_btn.click(
        fn=on_segmentation_predict,
        inputs=[seg_input, variant_dd, resolution_dd],
        outputs=[seg_out],
    )

    custom_btn.click(
        fn=on_zeroseg_custom,
        inputs=[custom_input, resolution_dd, custom_classes],
        outputs=[custom_overlay, custom_mask, custom_detected, custom_undetected],
    )
|
| 1018 |
+
|
| 1019 |
+
# Launch the Gradio app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
|
examples/depth/ade20k_00003.png
ADDED
|
Git LFS Details
|
examples/depth/ade20k_00007.png
ADDED
|
Git LFS Details
|
examples/depth/ade20k_00014.png
ADDED
|
Git LFS Details
|
examples/depth/ade20k_00022.png
ADDED
|
Git LFS Details
|
examples/nyuv2/bedroom_00280.jpg
ADDED
|
Git LFS Details
|
examples/nyuv2/kitchen_00249.jpg
ADDED
|
Git LFS Details
|
examples/nyuv2/living_room_01260.jpg
ADDED
|
Git LFS Details
|
examples/nyuv2/office_kitchen_00413.jpg
ADDED
|
Git LFS Details
|
examples/nyuv2/study_room_00272.jpg
ADDED
|
Git LFS Details
|
examples/pca/angus.jpeg
ADDED
|
Git LFS Details
|
examples/pca/cph.jpeg
ADDED
|
Git LFS Details
|
examples/pca/dadaocheng.jpeg
ADDED
|
Git LFS Details
|
examples/pca/hike.jpeg
ADDED
|
Git LFS Details
|
examples/zeroseg/pascal_context_00000_image.png
ADDED
|
Git LFS Details
|
examples/zeroseg/pascal_context_00007_image.png
ADDED
|
Git LFS Details
|
examples/zeroseg/pascal_context_00049_image.png
ADDED
|
Git LFS Details
|
examples/zeroseg/voc_2008_000891.jpg
ADDED
|
Git LFS Details
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
|
| 2 |
+
torch
|
| 3 |
+
torchvision
|
| 4 |
+
scikit-learn
|
| 5 |
+
matplotlib
|
| 6 |
+
Pillow
|
| 7 |
+
sentencepiece
|
| 8 |
+
numpy
|
| 9 |
+
fast_pytorch_kmeans
|
| 10 |
+
transformers
|