Initialize jaguar HyperView multigeometry Space
Browse files- .dockerignore +7 -0
- .gitignore +7 -0
- Dockerfile +38 -0
- README.md +107 -5
- assets/.gitkeep +2 -0
- config/model_manifest.json +45 -0
- demo.py +326 -0
- scripts/build_hyperview_demo_assets.py +532 -0
- scripts/publish_hyperview_demo_dataset.py +229 -0
.dockerignore
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
*.so
|
| 6 |
+
.venv/
|
| 7 |
+
dataset_build/
|
.gitignore
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
assets/*
|
| 2 |
+
!assets/.gitkeep
|
| 3 |
+
dataset_build/
|
| 4 |
+
dataset_build_smoke/
|
| 5 |
+
assets_runtime_smoke/
|
| 6 |
+
__pycache__/
|
| 7 |
+
scripts/__pycache__/
|
Dockerfile
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 4 |
+
build-essential \
|
| 5 |
+
curl \
|
| 6 |
+
git \
|
| 7 |
+
libssl-dev \
|
| 8 |
+
pkg-config \
|
| 9 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
RUN useradd -m -u 1000 user
|
| 12 |
+
USER user
|
| 13 |
+
|
| 14 |
+
ENV HOME=/home/user \
|
| 15 |
+
PATH=/home/user/.local/bin:$PATH \
|
| 16 |
+
HF_HOME=/home/user/.cache/huggingface \
|
| 17 |
+
PYTHONUNBUFFERED=1 \
|
| 18 |
+
PIP_NO_CACHE_DIR=1 \
|
| 19 |
+
HYPERVIEW_DATASETS_DIR=/home/user/app/demo_data/datasets \
|
| 20 |
+
HYPERVIEW_MEDIA_DIR=/home/user/app/demo_data/media
|
| 21 |
+
|
| 22 |
+
WORKDIR $HOME/app
|
| 23 |
+
|
| 24 |
+
RUN pip install --upgrade pip
|
| 25 |
+
|
| 26 |
+
ARG HYPERVIEW_VERSION=0.3.1
|
| 27 |
+
RUN pip install "hyperview[ml]==${HYPERVIEW_VERSION}" "datasets>=3.0.0" "numpy>=1.26.0" "pillow>=10.0.0" \
|
| 28 |
+
&& python -c "import hyperview; print('hyperview', hyperview.__version__)"
|
| 29 |
+
|
| 30 |
+
COPY --chown=user demo.py ./demo.py
|
| 31 |
+
COPY --chown=user assets ./assets
|
| 32 |
+
|
| 33 |
+
EXPOSE 7860
|
| 34 |
+
|
| 35 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=3 \
|
| 36 |
+
CMD curl -f http://localhost:7860/__hyperview__/health || exit 1
|
| 37 |
+
|
| 38 |
+
CMD ["python", "demo.py"]
|
README.md
CHANGED
|
@@ -1,10 +1,112 @@
|
|
| 1 |
---
|
| 2 |
-
title: Jaguar
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: "HyperView: Jaguar Embedding Geometry Comparison"
|
| 3 |
+
emoji: 🐆
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
pinned: false
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# HyperView Jaguar Core Claims Demo
|
| 12 |
+
|
| 13 |
+
This Space compares the top core-claims-set families in three geometric panels:
|
| 14 |
+
|
| 15 |
+
1. Euclidean: `triplet:T0:msv3` (seed 43)
|
| 16 |
+
2. Hyperspherical view: `arcface:O0:msv3` (seed 44)
|
| 17 |
+
3. Hyperbolic (Poincaré) view: `lorentz:O1:msv3` (seed 44)
|
| 18 |
+
|
| 19 |
+
The app loads train + validation-tagged samples from a resized Hugging Face dataset and injects precomputed embedding assets generated offline on GPU.
|
| 20 |
+
|
| 21 |
+
## Contracts
|
| 22 |
+
|
| 23 |
+
Runtime environment variables:
|
| 24 |
+
|
| 25 |
+
- `HF_DATASET_REPO` (default: `hyper3labs/jaguar-hyperview-demo`)
|
| 26 |
+
- `HF_DATASET_CONFIG` (default: `default`)
|
| 27 |
+
- `HF_DATASET_SPLIT` (default: `train`)
|
| 28 |
+
- `EMBEDDING_ASSET_DIR` (default: `./assets`)
|
| 29 |
+
- `EMBEDDING_ASSET_MANIFEST` (default: `${EMBEDDING_ASSET_DIR}/manifest.json`)
|
| 30 |
+
- `HYPERVIEW_DEFAULT_PANEL` (default: `spherical3d`; enables Sphere 3D as initial scatter panel)
|
| 31 |
+
- `HYPERVIEW_LAYOUT_CACHE_VERSION` (default: `v6`; bumps dock layout localStorage key to invalidate stale cached panel state)
|
| 32 |
+
- `HYPERVIEW_BIND_HOST` (preferred bind host; optional)
|
| 33 |
+
- `SPACE_HOST` (compat input only; used for bind only if local: `0.0.0.0`, `127.0.0.1`, `localhost`, `::`, `::1`)
|
| 34 |
+
- `SPACE_PORT` (primary port source)
|
| 35 |
+
- `PORT` (fallback port source when `SPACE_PORT` is unset)
|
| 36 |
+
|
| 37 |
+
Port precedence: `SPACE_PORT` > `PORT` > `7860`.
|
| 38 |
+
|
| 39 |
+
On Hugging Face Spaces, `SPACE_HOST` may be injected as `<space-subdomain>.hf.space`. That domain must not be used as a local bind socket, so the runtime falls back to `0.0.0.0` unless `HYPERVIEW_BIND_HOST` is explicitly set.
|
| 40 |
+
|
| 41 |
+
The runtime also patches HyperView's dock-layout cache key from legacy `hyperview:dockview-layout:v5` to `hyperview:dockview-layout:${HYPERVIEW_LAYOUT_CACHE_VERSION}` to force migration away from stale panel layouts after UI/layout changes. For future migrations, increment `HYPERVIEW_LAYOUT_CACHE_VERSION` (for example, `v7`) without changing code.
|
| 42 |
+
|
| 43 |
+
## Important Note
|
| 44 |
+
|
| 45 |
+
HyperView similarity search currently uses cosine distance in storage backends. The Lorentz panel in this Space is intended for embedding-space visualization and geometry-aware comparison rather than canonical Lorentz-distance retrieval scoring.
|
| 46 |
+
|
| 47 |
+
## Reproducibility Commands
|
| 48 |
+
|
| 49 |
+
Run from this folder (`HyperViewDemoHuggingFaceSpace/`).
|
| 50 |
+
|
| 51 |
+
### 1) Build embedding assets (GPU required)
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
source .venv/bin/activate
|
| 55 |
+
python3 scripts/build_hyperview_demo_assets.py \
|
| 56 |
+
--model_manifest config/model_manifest.json \
|
| 57 |
+
--dataset_root ../kaggle_jaguar_dataset_v2 \
|
| 58 |
+
--coreset_csv ../data/validation_coreset.csv \
|
| 59 |
+
--output_dir ./assets \
|
| 60 |
+
--device cuda \
|
| 61 |
+
--batch_size 64 \
|
| 62 |
+
--num_workers 4
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### 2) Publish resized demo dataset
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
source .venv/bin/activate
|
| 69 |
+
python3 scripts/publish_hyperview_demo_dataset.py \
|
| 70 |
+
--dataset_root ../kaggle_jaguar_dataset_v2 \
|
| 71 |
+
--coreset_csv ../data/validation_coreset.csv \
|
| 72 |
+
--output_dir ./dataset_build \
|
| 73 |
+
--repo_id hyper3labs/jaguar-hyperview-demo \
|
| 74 |
+
--config_name default
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
Use `--no_push` for local dry-runs.
|
| 78 |
+
|
| 79 |
+
### 3) Local Docker smoke run
|
| 80 |
+
|
| 81 |
+
```bash
|
| 82 |
+
docker build -t jaguar-hyperview .
|
| 83 |
+
docker run --rm -p 7860:7860 \
|
| 84 |
+
-e HF_DATASET_REPO=hyper3labs/jaguar-hyperview-demo \
|
| 85 |
+
-e EMBEDDING_ASSET_DIR=/home/user/app/assets \
|
| 86 |
+
jaguar-hyperview
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
Open `http://127.0.0.1:7860`.
|
| 90 |
+
|
| 91 |
+
### 4) Optional H100 batch export on HPI
|
| 92 |
+
|
| 93 |
+
```bash
|
| 94 |
+
sbatch remote_setup/build_hyperview_demo_assets_h100.slurm
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
Override defaults at submit time if needed:
|
| 98 |
+
|
| 99 |
+
```bash
|
| 100 |
+
MODEL_MANIFEST=config/model_manifest.json \
|
| 101 |
+
OUTPUT_DIR=./assets \
|
| 102 |
+
sbatch remote_setup/build_hyperview_demo_assets_h100.slurm
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
## Provenance
|
| 106 |
+
|
| 107 |
+
Model manifest: `config/model_manifest.json`
|
| 108 |
+
|
| 109 |
+
Ranking and source-of-truth anchors:
|
| 110 |
+
|
| 111 |
+
- `reports/summaries_of_findings/core_claims_axis12_paper_facing_tables_2026_03_16_102311/axis1_primary_ranking.csv`
|
| 112 |
+
- `paper_draft/second_draft/sources_of_truth.md`
|
assets/.gitkeep
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Generated assets are written here by scripts/build_hyperview_demo_assets.py.
|
| 2 |
+
# Keep this directory in git even when empty.
|
config/model_manifest.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "Core-claims top families for HyperView demo (TEST-best single-seed checkpoints)",
|
| 3 |
+
"source_of_truth": {
|
| 4 |
+
"ranking_csv": "reports/summaries_of_findings/core_claims_axis12_paper_facing_tables_2026_03_16_102311/axis1_primary_ranking.csv",
|
| 5 |
+
"registry": "paper_draft/second_draft/sources_of_truth.md"
|
| 6 |
+
},
|
| 7 |
+
"models": [
|
| 8 |
+
{
|
| 9 |
+
"model_key": "triplet_t0_msv3_seed43",
|
| 10 |
+
"comparison_key": "triplet:T0:msv3",
|
| 11 |
+
"family": "euclidean",
|
| 12 |
+
"loader": "triplet_benchmark",
|
| 13 |
+
"space_key": "triplet_t0_testbest",
|
| 14 |
+
"geometry": "euclidean",
|
| 15 |
+
"layout": "euclidean:2d",
|
| 16 |
+
"checkpoint_name": "triplet_miewid_msv3_T0_seed43_best.pth",
|
| 17 |
+
"checkpoint_path": "checkpoints/triplet_miewid_msv3_T0_seed43_best.pth",
|
| 18 |
+
"notes": "TEST-best seed in selected plan"
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"model_key": "arcface_o0_msv3_seed44",
|
| 22 |
+
"comparison_key": "arcface:O0:msv3",
|
| 23 |
+
"family": "hyperspherical",
|
| 24 |
+
"loader": "arcface_benchmark",
|
| 25 |
+
"space_key": "arcface_o0_testbest",
|
| 26 |
+
"geometry": "euclidean",
|
| 27 |
+
"layout": "spherical:3d",
|
| 28 |
+
"checkpoint_name": "arcface_miewid_msv3_O0_seed44_best.pth",
|
| 29 |
+
"checkpoint_path": "checkpoints/arcface_miewid_msv3_O0_seed44_best.pth",
|
| 30 |
+
"notes": "TEST-best seed in selected plan"
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"model_key": "lorentz_o1_msv3_seed44",
|
| 34 |
+
"comparison_key": "lorentz:O1:msv3",
|
| 35 |
+
"family": "hyperbolic_lorentz",
|
| 36 |
+
"loader": "lorentz",
|
| 37 |
+
"space_key": "lorentz_o1_testbest",
|
| 38 |
+
"geometry": "hyperboloid",
|
| 39 |
+
"layout": "poincare:2d",
|
| 40 |
+
"checkpoint_name": "lorentz_miewid_msv3_O1_seed44_best.pth",
|
| 41 |
+
"checkpoint_path": "checkpoints/lorentz_miewid_msv3_O1_seed44_best.pth",
|
| 42 |
+
"notes": "TEST-best seed in selected plan"
|
| 43 |
+
}
|
| 44 |
+
]
|
| 45 |
+
}
|
demo.py
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""HyperView Space runtime for core-claims top jaguar ReID models."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
from datasets import Dataset as HFDataset
|
| 14 |
+
from datasets import DatasetDict as HFDatasetDict
|
| 15 |
+
from datasets import load_dataset, load_from_disk
|
| 16 |
+
import hyperview as hv
|
| 17 |
+
from hyperview.core.sample import Sample
|
| 18 |
+
|
| 19 |
+
# Host value injected by the hosting platform. On Hugging Face Spaces this may be
# a public domain (e.g. <subdomain>.hf.space) rather than a bindable address;
# _resolve_bind_host() decides whether it is safe to bind to.
SPACE_HOST = os.environ.get("SPACE_HOST", "0.0.0.0")
# Host values that are valid local bind targets for a server socket.
LOCAL_BIND_HOSTS = {"0.0.0.0", "127.0.0.1", "localhost", "::", "::1"}

# Name of the HyperView dataset created/reused by this runtime.
DATASET_NAME = os.environ.get("HYPERVIEW_DATASET_NAME", "jaguar_core_claims_demo")
# Source of demo rows: either a Hub repo id or (see _load_hf_rows) a local path.
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "hyper3labs/jaguar-hyperview-demo")
HF_DATASET_CONFIG = os.environ.get("HF_DATASET_CONFIG", "default")
HF_DATASET_SPLIT = os.environ.get("HF_DATASET_SPLIT", "train")

# Directory holding precomputed embedding artifacts; defaults to ./assets next
# to this file so the Docker COPY layout works without configuration.
EMBEDDING_ASSET_DIR = Path(
    os.environ.get(
        "EMBEDDING_ASSET_DIR",
        str((Path(__file__).resolve().parent / "assets").resolve()),
    )
)
# Manifest describing the embedding files; defaults to <asset dir>/manifest.json.
ASSET_MANIFEST_PATH = Path(
    os.environ.get("EMBEDDING_ASSET_MANIFEST", str((EMBEDDING_ASSET_DIR / "manifest.json").resolve()))
)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _patch_hyperview_default_panel() -> None:
    """Patch HyperView 0.3.1 frontend for default panel and dock cache-key migration.

    HyperView currently has no public API for these behaviors. This runtime patch is
    intentionally narrow and idempotent, targeting the known bundled chunk for v0.3.1.

    Two independent edits are attempted on the bundled JS chunk:
      1. (optional) make the Sphere 3D panel the initially-active scatter panel,
         controlled by HYPERVIEW_DEFAULT_PANEL;
      2. (always) migrate the dock-layout localStorage cache key to the version in
         HYPERVIEW_LAYOUT_CACHE_VERSION, so stale cached panel layouts are dropped.
    All failure modes degrade to a printed notice; the function never raises.
    """
    default_panel = os.environ.get("HYPERVIEW_DEFAULT_PANEL", "spherical3d").strip().lower()
    apply_default_panel_patch = default_panel in {"spherical3d", "sphere3d"}
    if not apply_default_panel_patch:
        # NOTE: deliberately no early return — the cache-key migration below still runs.
        print(f"Skipping frontend default-panel patch (HYPERVIEW_DEFAULT_PANEL={default_panel!r}).")

    cache_version = os.environ.get("HYPERVIEW_LAYOUT_CACHE_VERSION", "v6").strip() or "v6"
    target_layout_key = f"hyperview:dockview-layout:{cache_version}"
    legacy_layout_key = "hyperview:dockview-layout:v5"
    # Fallback pattern used to locate whatever versioned key the chunk actually ships.
    layout_key_pattern = r"hyperview:dockview-layout:v\d+"

    # Known location of the relevant Next.js chunk inside the installed
    # hyperview package (specific to v0.3.1's build output).
    chunk_path = (
        Path(hv.__file__).resolve().parent
        / "server"
        / "static"
        / "_next"
        / "static"
        / "chunks"
        / "077b38561d6ea80d.js"
    )
    if not chunk_path.exists():
        print(f"Default-panel patch skipped: chunk not found at {chunk_path}")
        return

    # Exact minified-source markers: `marker_after` inserts a setActive() call on
    # the Sphere 3D panel registration. These must match the bundle byte-for-byte.
    marker_before = 'v||(v=n)};if(f&&l&&w({id:dr,title:"Euclidean"'
    marker_after = 'v||(v=n),t.id===dd&&n.api.setActive()};if(f&&l&&w({id:dr,title:"Euclidean"'

    try:
        payload = chunk_path.read_text(encoding="utf-8")
    except OSError as exc:
        print(f"Default-panel patch skipped: failed reading chunk ({exc})")
        return

    patched = payload
    changed = False  # only rewrite the chunk file if at least one edit applied

    if apply_default_panel_patch:
        if marker_after in patched:
            # Idempotence: a previous run already applied this edit.
            print("HyperView frontend already patched for Sphere 3D default panel.")
        elif marker_before in patched:
            patched = patched.replace(marker_before, marker_after, 1)
            changed = True
            print("Patched HyperView frontend: Sphere 3D will open as default scatter panel.")
        else:
            # Bundle differs from the expected v0.3.1 build; refuse to guess.
            print("Default-panel patch skipped: expected marker not found in HyperView chunk.")

    if target_layout_key in patched:
        print(f"HyperView frontend already uses dock cache key '{target_layout_key}'.")
    elif legacy_layout_key in patched:
        patched = patched.replace(legacy_layout_key, target_layout_key, 1)
        changed = True
        print(f"Patched HyperView frontend: dock cache key {legacy_layout_key} -> {target_layout_key}.")
    else:
        # Legacy key absent: try to migrate whichever versioned key is present.
        discovered = re.search(layout_key_pattern, patched)
        if discovered:
            source_key = discovered.group(0)
            if source_key == target_layout_key:
                print(f"HyperView frontend already uses dock cache key '{target_layout_key}'.")
            else:
                print(
                    f"Dock cache patch notice: expected legacy key '{legacy_layout_key}' not found; "
                    f"migrating detected key '{source_key}' -> '{target_layout_key}'."
                )
                patched = patched.replace(source_key, target_layout_key, 1)
                changed = True
        else:
            print(
                "Dock cache patch warning: expected layout cache key marker "
                f"'{legacy_layout_key}' not found in HyperView chunk."
            )

    if not changed:
        return

    try:
        chunk_path.write_text(patched, encoding="utf-8")
    except OSError as exc:
        # Best-effort: an unwritable site-packages (read-only image layer) is survivable.
        print(f"Frontend patch skipped: failed writing chunk ({exc})")
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def _resolve_bind_host() -> tuple[str, str | None]:
|
| 124 |
+
explicit_bind = os.environ.get("HYPERVIEW_BIND_HOST")
|
| 125 |
+
if explicit_bind:
|
| 126 |
+
return explicit_bind, None
|
| 127 |
+
|
| 128 |
+
if SPACE_HOST in LOCAL_BIND_HOSTS:
|
| 129 |
+
return SPACE_HOST, None
|
| 130 |
+
|
| 131 |
+
return "0.0.0.0", f"SPACE_HOST='{SPACE_HOST}' is non-local; falling back to 0.0.0.0"
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _resolve_port() -> int:
|
| 135 |
+
for key in ("SPACE_PORT", "PORT"):
|
| 136 |
+
value = os.environ.get(key)
|
| 137 |
+
if value:
|
| 138 |
+
try:
|
| 139 |
+
return int(value)
|
| 140 |
+
except ValueError as exc:
|
| 141 |
+
raise ValueError(f"Invalid integer value for {key}: {value}") from exc
|
| 142 |
+
return 7860
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def load_asset_manifest(path: Path) -> dict[str, Any]:
    """Load and minimally validate the embedding asset manifest.

    Raises ``FileNotFoundError`` when the manifest is absent and ``ValueError``
    when the JSON payload lacks a ``"models"`` list.
    """
    if not path.exists():
        raise FileNotFoundError(
            f"Embedding asset manifest not found: {path}. "
            "Run scripts/build_hyperview_demo_assets.py first."
        )
    manifest = json.loads(path.read_text(encoding="utf-8"))
    # The runtime only needs "models" to be a list; entries are validated later.
    if not isinstance(manifest.get("models"), list):
        raise ValueError(f"Invalid asset manifest format: {path}")
    return manifest
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _load_hf_rows() -> HFDataset:
    """Return the demo rows as a single HF split.

    When ``HF_DATASET_REPO`` names an existing local path, the dataset is read
    from disk; otherwise it is fetched from the Hugging Face Hub.
    """
    local_path = Path(HF_DATASET_REPO)
    if not local_path.exists():
        # Remote hub repo: let `datasets` handle download and caching.
        return load_dataset(HF_DATASET_REPO, name=HF_DATASET_CONFIG, split=HF_DATASET_SPLIT)

    loaded = load_from_disk(str(local_path))
    if not isinstance(loaded, HFDatasetDict):
        return loaded
    # Split preference: requested split, then "train", then whichever exists first.
    for split_name in (HF_DATASET_SPLIT, "train"):
        if split_name in loaded:
            return loaded[split_name]
    return loaded[next(iter(loaded))]
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def ingest_hf_dataset_samples(dataset: hv.Dataset) -> None:
    """Copy HF rows into the HyperView dataset, materializing images on disk.

    Each row's image is re-encoded as a JPEG under HYPERVIEW_MEDIA_DIR so that
    HyperView can serve it by filepath. Rows whose sample_id already exists in
    the dataset's storage are skipped, making this safe to re-run.
    """
    rows = _load_hf_rows()
    media_root = Path(os.environ.get("HYPERVIEW_MEDIA_DIR", "./demo_data/media")) / DATASET_NAME
    media_root.mkdir(parents=True, exist_ok=True)

    added = 0
    for index, row in enumerate(rows):
        # Fallback names keep ingestion working even if columns are missing.
        filename = str(row.get("filename", f"sample_{index:06d}.jpg"))
        sample_id = str(row.get("sample_id", filename))
        # NOTE(review): uses the private _storage API — no public existence check
        # appears to be available on hv.Dataset.
        if dataset._storage.get_sample(sample_id) is not None:
            continue

        # `image` is assumed to be a PIL-like object (HF image feature) — it is
        # only used via .convert()/.save().
        image_obj = row["image"]
        image_path = media_root / f"{Path(sample_id).stem}.jpg"
        if not image_path.exists():
            image_obj.convert("RGB").save(image_path, format="JPEG", quality=90, optimize=True)

        label = str(row.get("label", ""))
        # Provenance metadata so each sample records where it came from.
        metadata = {
            "filename": filename,
            "sample_id": sample_id,
            "split_tag": str(row.get("split_tag", "unknown")),
            "identity": label,
            "source_repo": HF_DATASET_REPO,
            "source_config": HF_DATASET_CONFIG,
            "source_split": HF_DATASET_SPLIT,
        }

        dataset.add_sample(
            Sample(
                id=sample_id,
                filepath=str(image_path),
                label=label,
                metadata=metadata,
            )
        )
        added += 1

    print(f"Ingested {added} HF samples into HyperView dataset '{DATASET_NAME}'.")
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def ensure_embedding_spaces(dataset: hv.Dataset, asset_manifest: dict[str, Any], asset_dir: Path) -> None:
    """Load each model's precomputed embeddings and register them as HyperView spaces.

    For every entry in ``asset_manifest["models"]`` this validates the .npz
    payload (2-D float32 vectors, one id per row, every id present in the
    dataset) and then ensures a space + embeddings via the storage backend.

    Raises ValueError / FileNotFoundError / RuntimeError on manifest,
    file-layout, or id-coverage problems respectively.
    """
    known_sample_ids = {sample.id for sample in dataset.samples}

    for model in asset_manifest["models"]:
        model_key = str(model["model_key"])
        space_key = str(model["space_key"])
        embeddings_rel = model.get("embeddings_path")
        if not embeddings_rel:
            raise ValueError(f"Missing embeddings_path in asset manifest for model {model_key}")

        embeddings_path = asset_dir / str(embeddings_rel)
        if not embeddings_path.exists():
            raise FileNotFoundError(
                f"Missing embeddings file for model {model_key}: {embeddings_path}"
            )

        # allow_pickle=False: the asset must be a plain array archive, never
        # pickled objects (defense against tampered asset files).
        payload = np.load(embeddings_path, allow_pickle=False)
        ids = [str(x) for x in payload["ids"].tolist()]
        vectors = np.asarray(payload["vectors"], dtype=np.float32)

        if vectors.ndim != 2:
            raise ValueError(f"Embeddings for {model_key} must be 2D; got {vectors.shape}")
        if len(ids) != vectors.shape[0]:
            raise ValueError(
                f"Embeddings/ID mismatch for {model_key}: {len(ids)} ids vs {vectors.shape[0]} vectors"
            )

        # Every embedded id must correspond to an ingested sample; fail fast
        # with a short preview rather than dumping the whole id list.
        missing_ids = sorted(set(ids) - known_sample_ids)
        if missing_ids:
            preview = ", ".join(missing_ids[:5])
            raise RuntimeError(
                f"Embedding IDs missing from loaded dataset for {model_key}. "
                f"First missing IDs: {preview}"
            )

        # Space-level provenance recorded alongside the embeddings.
        config = {
            "provider": "precomputed-checkpoint",
            "geometry": str(model["geometry"]),
            "comparison_key": model.get("comparison_key"),
            "family": model.get("family"),
            "checkpoint_path": model.get("checkpoint_path"),
        }

        # NOTE(review): private _storage API — ensure_space/add_embeddings have
        # no public equivalent on hv.Dataset as of the pinned version.
        dataset._storage.ensure_space(
            model_id=model_key,
            dim=int(vectors.shape[1]),
            space_key=space_key,
            config=config,
        )
        dataset._storage.add_embeddings(space_key, ids, vectors)

        print(f"Ensured space {space_key} ({vectors.shape[0]} x {vectors.shape[1]})")
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def ensure_layouts(dataset: hv.Dataset, asset_manifest: dict[str, Any]) -> list[str]:
    """Compute (or reuse) one UMAP visualization layout per manifest model.

    Returns the resulting layout keys in manifest order.
    """
    ensured: list[str] = []
    for entry in asset_manifest["models"]:
        space = str(entry["space_key"])
        spec = str(entry.get("layout", "euclidean:2d"))
        # force=False lets HyperView reuse a previously computed layout.
        key = dataset.compute_visualization(
            space_key=space,
            layout=spec,
            method="umap",
            force=False,
        )
        print(f"Ensured layout {key} for space={space}")
        ensured.append(key)
    return ensured
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def build_dataset() -> hv.Dataset:
    """Assemble the HyperView dataset: samples, embedding spaces, and layouts."""
    manifest = load_asset_manifest(ASSET_MANIFEST_PATH)
    dataset = hv.Dataset(DATASET_NAME)

    # Ingest HF rows only when the dataset holds no samples yet.
    if not len(dataset):
        print(
            f"Loading HF dataset rows from {HF_DATASET_REPO}[{HF_DATASET_CONFIG}] split={HF_DATASET_SPLIT}"
        )
        ingest_hf_dataset_samples(dataset)

    ensure_embedding_spaces(dataset, asset_manifest=manifest, asset_dir=EMBEDDING_ASSET_DIR)
    layouts = ensure_layouts(dataset, asset_manifest=manifest)

    # Startup summary for the Space logs.
    print(f"Dataset '{DATASET_NAME}' has {len(dataset)} samples")
    print(f"Spaces: {[space.space_key for space in dataset.list_spaces()]}")
    print(f"Layouts: {layouts}")
    return dataset
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
def main() -> None:
    """Entry point: patch the frontend, build the dataset, then serve HyperView."""
    _patch_hyperview_default_panel()
    dataset = build_dataset()

    # Smoke/CI mode: prepare everything without opening a server socket.
    if os.environ.get("HYPERVIEW_DEMO_PREP_ONLY") == "1":
        print("Preparation-only mode enabled; skipping server launch.")
        return

    host, warning = _resolve_bind_host()
    port = _resolve_port()
    if warning:
        print(f"Bind host notice: {warning}")

    # Echo the effective binding plus the raw env inputs for debuggability.
    print(
        "Starting HyperView with "
        f"bind_host={host} bind_port={port} "
        f"(SPACE_HOST={SPACE_HOST!r}, SPACE_PORT={os.environ.get('SPACE_PORT')!r}, "
        f"PORT={os.environ.get('PORT')!r})"
    )
    hv.launch(dataset, host=host, port=port, open_browser=False)


if __name__ == "__main__":
    main()
|
scripts/build_hyperview_demo_assets.py
ADDED
|
@@ -0,0 +1,532 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Build precomputed HyperView embedding assets for the jaguar Space."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import json
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from datetime import datetime, timezone
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import sys
|
| 12 |
+
from typing import Any
|
| 13 |
+
from urllib.parse import urlparse
|
| 14 |
+
|
| 15 |
+
import numpy as np
|
| 16 |
+
import pandas as pd
|
| 17 |
+
import torch
|
| 18 |
+
from PIL import Image
|
| 19 |
+
from torch.utils.data import DataLoader, Dataset
|
| 20 |
+
from tqdm import tqdm
|
| 21 |
+
|
| 22 |
+
# Repo root is two levels above this script (scripts/ -> HyperViewDemoHuggingFaceSpace/ -> root).
PROJECT_ROOT = Path(__file__).resolve().parents[2]
# Make repo-local packages importable when the script is run directly.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# These imports require the sys.path tweak above, hence the E402 suppressions.
from experiment_scripts.evaluate_inpainted_bgfg import (  # noqa: E402
    _load_arcface_benchmark,
    _load_lorentz,
    _load_triplet_benchmark,
)
from experiment_scripts.train_lorentz_reid import build_transforms  # noqa: E402

# CLI defaults, all resolved relative to the repo root.
DEFAULT_MANIFEST_PATH = PROJECT_ROOT / "HyperViewDemoHuggingFaceSpace/config/model_manifest.json"
DEFAULT_DATASET_ROOT = PROJECT_ROOT / "kaggle_jaguar_dataset_v2"
DEFAULT_CORESET_CSV = PROJECT_ROOT / "data/validation_coreset.csv"
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "HyperViewDemoHuggingFaceSpace/assets"
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@dataclass
class LoadedModel:
    """A loaded checkpoint bundled with its evaluation-time preprocessing."""

    model: Any            # the instantiated network returned by a _load_* helper
    val_transform: Any    # validation/inference transform paired with the model
    image_size: int       # square input resolution the transform produces
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class JaguarEmbeddingDataset(Dataset):
    """Torch dataset yielding transformed images plus identifying metadata.

    Each item is a 5-tuple: (image_tensor, sample_id, label, filename, split_tag).
    Supports both torchvision-style (callable on PIL) and albumentations-style
    (keyword `image=` ndarray) transforms.
    """

    def __init__(
        self,
        rows: list[dict[str, str]],
        images_dir: Path,
        transform: Any,
        image_variant: str,
    ):
        # rows: dicts with at least filename/sample_id/label/split_tag keys.
        self.rows = rows
        self.images_dir = images_dir
        self.transform = transform
        # "foreground_only" applies the alpha channel as a cutout mask.
        self.image_variant = image_variant

    def __len__(self) -> int:
        return len(self.rows)

    @staticmethod
    def _is_albumentations_transform(transform: Any) -> bool:
        # Duck-type detection: albumentations transforms live in the
        # "albumentations" module tree and take image= keyword input.
        return transform.__class__.__module__.startswith("albumentations")

    def _load_image(self, filename: str) -> Image.Image:
        """Load an RGB image, optionally zeroing background via the alpha mask."""
        image_path = self.images_dir / filename
        if self.image_variant == "foreground_only":
            # Use alpha > 0 as a binary foreground mask and black out the rest.
            rgba = Image.open(image_path).convert("RGBA")
            rgba_np = np.array(rgba, dtype=np.uint8)
            rgb = rgba_np[:, :, :3]
            alpha = rgba_np[:, :, 3]
            mask = (alpha > 0).astype(np.uint8)
            cutout_rgb = (rgb * mask[:, :, np.newaxis]).astype(np.uint8)
            return Image.fromarray(cutout_rgb, mode="RGB")
        return Image.open(image_path).convert("RGB")

    def __getitem__(self, idx: int):
        row = self.rows[idx]
        image = self._load_image(row["filename"])

        if self.transform is None:
            raise ValueError("Validation transform is required for embedding extraction.")

        # Albumentations consumes ndarrays via keyword; torchvision consumes PIL.
        if self._is_albumentations_transform(self.transform):
            image_tensor = self.transform(image=np.array(image, dtype=np.uint8))["image"]
        else:
            image_tensor = self.transform(image)

        return (
            image_tensor,
            row["sample_id"],
            row["label"],
            row["filename"],
            row["split_tag"],
        )
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the embedding-asset build pipeline."""
    parser = argparse.ArgumentParser(
        description="Build precomputed embedding artifacts for HyperView Space runtime."
    )

    # Path-valued options share identical wiring; declare them as a table.
    path_options = (
        ("--model_manifest", DEFAULT_MANIFEST_PATH, "Model manifest JSON defining the three demo models."),
        ("--dataset_root", DEFAULT_DATASET_ROOT, "Dataset root containing train.csv and train/ images."),
        ("--coreset_csv", DEFAULT_CORESET_CSV, "Validation coreset CSV used to tag split_tag=train/validation."),
        ("--output_dir", DEFAULT_OUTPUT_DIR, "Output directory for per-model embeddings and manifest JSON."),
    )
    for flag, default, help_text in path_options:
        parser.add_argument(flag, type=Path, default=default, help=help_text)

    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        choices=["cuda"],
        help="Runtime device. CUDA-only by contract.",
    )
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument(
        "--image_variant",
        type=str,
        default="foreground_only",
        choices=["foreground_only", "full_rgb"],
    )
    parser.add_argument(
        "--max_samples",
        type=int,
        default=None,
        help="Optional smoke-mode sample cap for quick checks.",
    )
    return parser.parse_args()
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def utc_now() -> str:
    """Current UTC time as an ISO-8601 string with a 'Z' suffix."""
    stamp = datetime.now(timezone.utc)
    return stamp.strftime("%Y-%m-%dT%H:%M:%SZ")
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def resolve_device(device_name: str) -> torch.device:
    """Validate the requested device and return a CUDA ``torch.device``.

    The pipeline is CUDA-only by contract; any other device name is rejected.

    Raises:
        SystemExit: if a non-CUDA device is requested, or CUDA is requested
            but unavailable on this host.
    """
    if device_name != "cuda":
        # Bug fix: the original reused the "CUDA requested but not available"
        # message here, which is misleading when CUDA was never requested.
        raise SystemExit(f"Unsupported device '{device_name}': this pipeline is CUDA-only.")
    if not torch.cuda.is_available():
        raise SystemExit("GPU unavailable: CUDA requested but not available.")
    return torch.device("cuda")
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def load_model_manifest(manifest_path: Path) -> dict[str, Any]:
    """Read and sanity-check the model manifest JSON, returning the parsed payload."""
    payload = json.loads(manifest_path.read_text(encoding="utf-8"))
    models = payload["models"] if "models" in payload else None
    if not isinstance(models, list):
        raise ValueError(f"Invalid model manifest: {manifest_path}")
    return payload
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def parse_run_url(run_url: str) -> tuple[str, str, str]:
    """Split a W&B run URL into (entity, project, run_id).

    Expects a path shaped like /<entity>/<project>/runs/<run_id>.
    """
    segments = [segment for segment in urlparse(run_url).path.split("/") if segment]
    if len(segments) < 4 or segments[2] != "runs":
        raise ValueError(f"Unsupported W&B run URL format: {run_url}")
    entity, project, _, run_id = segments[:4]
    return entity, project, run_id
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def pick_checkpoint_file(root: Path, checkpoint_name: str | None) -> Path:
    """Locate a checkpoint under *root*: exact-name match first, else first *.pth.

    Results are sorted so the pick is deterministic across filesystems.
    """
    if checkpoint_name:
        named_matches = sorted(root.rglob(checkpoint_name))
        if named_matches:
            return named_matches[0]

    fallback_matches = sorted(root.rglob("*.pth"))
    if fallback_matches:
        return fallback_matches[0]
    raise FileNotFoundError(f"No .pth checkpoints found under downloaded artifact: {root}")
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def download_checkpoint_from_wandb(
    run_url: str,
    model_key: str,
    checkpoint_name: str | None,
    output_dir: Path,
) -> tuple[Path, str]:
    """Download the last-logged model artifact for a W&B run and locate its checkpoint.

    Args:
        run_url: Browser URL of the W&B run (…/<entity>/<project>/runs/<run_id>).
        model_key: Key used to namespace the local download directory.
        checkpoint_name: Optional exact checkpoint filename to prefer.
        output_dir: Asset output root; files land under downloaded_checkpoints/.

    Returns:
        (checkpoint_path, source_tag) where source_tag records the artifact name.

    Raises:
        ImportError: wandb is not installed.
        FileNotFoundError: the run logged no model-type artifacts, or no .pth
            file exists in the downloaded artifact.
    """
    try:
        # Imported lazily so the build script works without wandb when all
        # checkpoints are already present locally.
        import wandb
    except ImportError as exc:
        raise ImportError(
            "wandb is required to download missing checkpoints. Install with `uv pip install wandb`."
        ) from exc

    entity, project, run_id = parse_run_url(run_url)
    api = wandb.Api()
    run = api.run(f"{entity}/{project}/{run_id}")

    artifacts = [artifact for artifact in run.logged_artifacts() if artifact.type == "model"]
    if not artifacts:
        raise FileNotFoundError(
            f"No model artifacts found for run {entity}/{project}/{run_id}."
        )

    # Take the last entry as the most recent model.
    # NOTE(review): assumes logged_artifacts() yields in logging order — confirm
    # against the wandb API before relying on this for runs with many artifacts.
    artifact = artifacts[-1]
    # Artifact names can contain '/' and ':'; sanitize for use as a directory name.
    safe_name = artifact.name.replace("/", "_").replace(":", "_")
    download_root = output_dir / "downloaded_checkpoints" / model_key / safe_name
    download_root.mkdir(parents=True, exist_ok=True)
    downloaded_dir = Path(artifact.download(root=str(download_root)))

    checkpoint_path = pick_checkpoint_file(downloaded_dir, checkpoint_name)
    return checkpoint_path, f"wandb_artifact:{artifact.name}"
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def resolve_checkpoint_path(model_cfg: dict[str, Any], output_dir: Path) -> tuple[Path, str]:
    """Return (checkpoint_path, source_tag), falling back to a W&B download.

    Relative manifest paths are resolved against PROJECT_ROOT. If the local
    file is missing, the model's run_url is used to fetch the checkpoint.
    """
    candidate = Path(model_cfg.get("checkpoint_path", ""))
    if not candidate.is_absolute():
        candidate = (PROJECT_ROOT / candidate).resolve()

    if candidate.exists():
        return candidate, "local_path"

    run_url = model_cfg.get("run_url")
    if run_url:
        return download_checkpoint_from_wandb(
            run_url=run_url,
            model_key=str(model_cfg["model_key"]),
            checkpoint_name=model_cfg.get("checkpoint_name"),
            output_dir=output_dir,
        )

    raise FileNotFoundError(
        f"Checkpoint not found at {candidate} and no run_url provided for fallback download."
    )
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def read_augmentation_profile(checkpoint_path: Path) -> str:
    """Read the augmentation profile recorded in a checkpoint (default: lorentz_default)."""
    # NOTE(review): weights_only=False deserializes arbitrary pickled objects;
    # acceptable only because these checkpoints come from trusted training runs.
    state = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    return str(state.get("augmentation_profile", "lorentz_default"))
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def load_model(model_cfg: dict[str, Any], checkpoint_path: Path, device: str) -> LoadedModel:
    """Instantiate one demo model with its validation transform.

    The `loader` field of *model_cfg* selects how to restore the checkpoint:
    "arcface_benchmark", "triplet_benchmark", or "lorentz".

    Raises:
        ValueError: the manifest names an unknown loader.
    """
    loader = str(model_cfg["loader"])

    if loader == "lorentz":
        # Lorentz checkpoints bundle their own validation transform.
        model, image_size, _metric, val_tf = _load_lorentz(str(checkpoint_path), device)
        return LoadedModel(model=model, val_transform=val_tf, image_size=int(image_size))

    # The two benchmark loaders share identical post-load wiring; the original
    # duplicated this branch verbatim — dispatch on the loader function instead.
    benchmark_loaders = {
        "arcface_benchmark": _load_arcface_benchmark,
        "triplet_benchmark": _load_triplet_benchmark,
    }
    if loader in benchmark_loaders:
        model, image_size, _metric = benchmark_loaders[loader](str(checkpoint_path), device)
        # Benchmark checkpoints do not embed a transform; rebuild it from the
        # augmentation profile recorded at training time.
        augmentation_profile = read_augmentation_profile(checkpoint_path)
        _train_tf, val_tf, _resolved = build_transforms(image_size, augmentation_profile=augmentation_profile)
        return LoadedModel(model=model, val_transform=val_tf, image_size=int(image_size))

    raise ValueError(f"Unsupported loader='{loader}' in model manifest.")
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def build_sample_rows(
    dataset_root: Path,
    coreset_csv: Path,
    max_samples: int | None,
) -> list[dict[str, str]]:
    """Assemble per-image rows from train.csv, tagging coreset files as 'validation'.

    Each row carries sample_id, filename, label, and split_tag (all strings).
    Raises FileNotFoundError when train.csv or the train/ image dir is missing.
    """
    train_csv = dataset_root / "train.csv"
    images_dir = dataset_root / "train"
    if not train_csv.exists():
        raise FileNotFoundError(f"Missing train.csv at {train_csv}")
    if not images_dir.exists():
        raise FileNotFoundError(f"Missing train images directory at {images_dir}")

    frame = pd.read_csv(train_csv).copy()
    validation_files = set(pd.read_csv(coreset_csv)["filename"].astype(str).tolist())

    frame["filename"] = frame["filename"].astype(str)
    frame["ground_truth"] = frame["ground_truth"].astype(str)
    frame["sample_id"] = frame["filename"]
    # Coreset membership decides the split; everything else stays 'train'.
    frame["split_tag"] = np.where(frame["filename"].isin(validation_files), "validation", "train")

    if max_samples is not None:
        frame = frame.iloc[: int(max_samples)].copy()

    return [
        {
            "sample_id": str(record["sample_id"]),
            "filename": str(record["filename"]),
            "label": str(record["ground_truth"]),
            "split_tag": str(record["split_tag"]),
        }
        for _, record in frame.iterrows()
    ]
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def extract_embeddings(
    loaded_model: LoadedModel,
    rows: list[dict[str, str]],
    images_dir: Path,
    image_variant: str,
    device: torch.device,
    batch_size: int,
    num_workers: int,
    progress_label: str,
) -> tuple[list[str], np.ndarray, list[str], list[str], list[str]]:
    """Run the model over every row and return embeddings plus aligned metadata.

    Returns (sample_ids, vectors, labels, filenames, split_tags); `vectors` is
    a float32 array of shape (num_rows, embedding_dim), row-aligned with the
    metadata lists. Order matches `rows` because the DataLoader does not shuffle.

    Raises:
        RuntimeError: if the loader yields no batches at all.
    """
    dataset = JaguarEmbeddingDataset(
        rows=rows,
        images_dir=images_dir,
        transform=loaded_model.val_transform,
        image_variant=image_variant,
    )
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,  # preserves row order for the downstream alignment check
        num_workers=num_workers,
        pin_memory=True,
    )

    all_vectors: list[np.ndarray] = []
    all_ids: list[str] = []
    all_labels: list[str] = []
    all_filenames: list[str] = []
    all_split_tags: list[str] = []

    loaded_model.model.eval()
    with torch.no_grad():
        for images, sample_ids, labels, filenames, split_tags in tqdm(loader, desc=progress_label):
            images = images.to(device, non_blocking=True)
            vectors = loaded_model.model(images)
            # Some models return (embedding, aux, ...); keep only the embedding.
            if isinstance(vectors, (tuple, list)):
                vectors = vectors[0]
            vectors_np = vectors.detach().cpu().numpy().astype(np.float32)

            all_vectors.append(vectors_np)
            # Metadata arrives as batched sequences from the default collate;
            # flatten them in lockstep with the embedding rows.
            all_ids.extend([str(x) for x in sample_ids])
            all_labels.extend([str(x) for x in labels])
            all_filenames.extend([str(x) for x in filenames])
            all_split_tags.extend([str(x) for x in split_tags])

    if not all_vectors:
        raise RuntimeError("No embeddings were generated.")

    stacked = np.vstack(all_vectors).astype(np.float32)
    return all_ids, stacked, all_labels, all_filenames, all_split_tags
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
def save_model_artifacts(
    output_dir: Path,
    model_cfg: dict[str, Any],
    checkpoint_path: Path,
    checkpoint_source: str,
    sample_ids: list[str],
    vectors: np.ndarray,
    labels: list[str],
    filenames: list[str],
    split_tags: list[str],
    image_variant: str,
    image_size: int,
    batch_size: int,
    num_workers: int,
) -> dict[str, Any]:
    """Write embeddings.npz and metadata.json for one model under models/<key>/.

    Returns the manifest entry describing the emitted artifacts (paths are
    relative to *output_dir*).
    """
    model_key = str(model_cfg["model_key"])
    target_dir = output_dir / "models" / model_key
    target_dir.mkdir(parents=True, exist_ok=True)

    embeddings_path = target_dir / "embeddings.npz"
    metadata_path = target_dir / "metadata.json"

    # All per-sample arrays share row order with `vectors`.
    arrays = {
        "ids": np.asarray(sample_ids),
        "vectors": vectors,
        "labels": np.asarray(labels),
        "filenames": np.asarray(filenames),
        "split_tags": np.asarray(split_tags),
    }
    np.savez_compressed(embeddings_path, **arrays)

    num_samples = int(vectors.shape[0])
    embedding_dim = int(vectors.shape[1])

    metadata = {
        "generated_at_utc": utc_now(),
        "model_key": model_key,
        "comparison_key": model_cfg.get("comparison_key"),
        "family": model_cfg.get("family"),
        "loader": model_cfg.get("loader"),
        "space_key": model_cfg.get("space_key"),
        "geometry": model_cfg.get("geometry"),
        "layout": model_cfg.get("layout"),
        "num_samples": num_samples,
        "embedding_dim": embedding_dim,
        "checkpoint_path": str(checkpoint_path),
        "checkpoint_source": checkpoint_source,
        "run_url": model_cfg.get("run_url"),
        "image_variant": image_variant,
        "image_size": int(image_size),
        "batch_size": int(batch_size),
        "num_workers": int(num_workers),
    }
    metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")

    return {
        "model_key": model_key,
        "comparison_key": model_cfg.get("comparison_key"),
        "family": model_cfg.get("family"),
        "loader": model_cfg.get("loader"),
        "space_key": model_cfg.get("space_key"),
        "geometry": model_cfg.get("geometry"),
        "layout": model_cfg.get("layout"),
        "checkpoint_path": str(checkpoint_path),
        "checkpoint_source": checkpoint_source,
        "run_url": model_cfg.get("run_url"),
        "embeddings_path": str(embeddings_path.relative_to(output_dir)),
        "metadata_path": str(metadata_path.relative_to(output_dir)),
        "num_samples": num_samples,
        "embedding_dim": embedding_dim,
    }
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
def write_sample_index(output_dir: Path, rows: list[dict[str, str]]) -> Path:
    """Persist the ordered sample rows as sample_index.csv and return its path."""
    index_path = output_dir / "sample_index.csv"
    pd.DataFrame(rows).to_csv(index_path, index=False)
    return index_path
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
def main() -> int:
    """Orchestrate the full asset build.

    Steps:
      1. Parse CLI args and require a usable CUDA device.
      2. Build the ordered sample-row index from train.csv + the coreset CSV.
      3. For each model in the manifest: resolve (or download) its checkpoint,
         extract embeddings, verify row alignment, and persist artifacts.
      4. Write a top-level manifest.json describing everything emitted.

    Returns 0 on success (used as the process exit code).
    """
    args = parse_args()
    device = resolve_device(args.device)

    model_manifest = load_model_manifest(args.model_manifest)
    output_dir = args.output_dir.resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    dataset_root = args.dataset_root.resolve()
    images_dir = dataset_root / "train"
    rows = build_sample_rows(
        dataset_root=dataset_root,
        coreset_csv=args.coreset_csv,
        max_samples=args.max_samples,
    )
    if not rows:
        raise RuntimeError("No rows found in train.csv after applying filters.")

    # The order written here is the contract every model's embedding matrix
    # must follow row-for-row.
    expected_ids = [row["sample_id"] for row in rows]
    sample_index_path = write_sample_index(output_dir, rows)

    emitted_models: list[dict[str, Any]] = []
    for model_cfg in model_manifest["models"]:
        model_key = str(model_cfg["model_key"])
        print(f"\n=== Building embeddings for {model_key} ===")

        checkpoint_path, checkpoint_source = resolve_checkpoint_path(model_cfg=model_cfg, output_dir=output_dir)
        print(f"Checkpoint: {checkpoint_path} ({checkpoint_source})")

        loaded_model = load_model(model_cfg=model_cfg, checkpoint_path=checkpoint_path, device=args.device)
        ids, vectors, labels, filenames, split_tags = extract_embeddings(
            loaded_model=loaded_model,
            rows=rows,
            images_dir=images_dir,
            image_variant=args.image_variant,
            device=device,
            batch_size=int(args.batch_size),
            num_workers=int(args.num_workers),
            progress_label=f"extract:{model_key}",
        )

        # Guard against any reordering or dropped samples during extraction.
        if ids != expected_ids:
            raise RuntimeError(
                f"Sample ID alignment failed for {model_key}: extracted order does not match expected sample index."
            )

        emitted = save_model_artifacts(
            output_dir=output_dir,
            model_cfg=model_cfg,
            checkpoint_path=checkpoint_path,
            checkpoint_source=checkpoint_source,
            sample_ids=ids,
            vectors=vectors,
            labels=labels,
            filenames=filenames,
            split_tags=split_tags,
            image_variant=args.image_variant,
            image_size=loaded_model.image_size,
            batch_size=int(args.batch_size),
            num_workers=int(args.num_workers),
        )
        emitted_models.append(emitted)

    manifest_out = {
        "generated_at_utc": utc_now(),
        "source_model_manifest": str(args.model_manifest.resolve()),
        "dataset": {
            "dataset_root": str(dataset_root),
            "images_dir": str(images_dir),
            "coreset_csv": str(args.coreset_csv.resolve()),
            "num_samples": len(rows),
            "image_variant": args.image_variant,
            "sample_index_csv": str(sample_index_path.relative_to(output_dir)),
        },
        "models": emitted_models,
    }

    manifest_path = output_dir / "manifest.json"
    manifest_path.write_text(json.dumps(manifest_out, indent=2), encoding="utf-8")

    print("\n=== HyperView asset build complete ===")
    print(f"Sample count: {len(rows)}")
    print(f"Manifest: {manifest_path}")
    for emitted in emitted_models:
        print(
            f"- {emitted['model_key']}: {emitted['num_samples']} x {emitted['embedding_dim']} "
            f"({emitted['embeddings_path']})"
        )

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
scripts/publish_hyperview_demo_dataset.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Create and optionally publish a resized HF dataset for the HyperView Space."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
from datetime import datetime, timezone
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from PIL import Image
|
| 15 |
+
from datasets import Dataset, Image as HFImage
|
| 16 |
+
|
| 17 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
| 18 |
+
DEFAULT_DATASET_ROOT = PROJECT_ROOT / "kaggle_jaguar_dataset_v2"
|
| 19 |
+
DEFAULT_CORESET_CSV = PROJECT_ROOT / "data/validation_coreset.csv"
|
| 20 |
+
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "HyperViewDemoHuggingFaceSpace/dataset_build"
|
| 21 |
+
DEFAULT_REPO_ID = os.environ.get("HF_DATASET_REPO", "hyper3labs/jaguar-hyperview-demo")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def utc_now() -> str:
    """Return the current UTC timestamp formatted as YYYY-MM-DDTHH:MM:SSZ."""
    now_utc = datetime.now(timezone.utc)
    return now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the demo-dataset build/publish pipeline."""
    parser = argparse.ArgumentParser(
        description="Build resized train+validation demo dataset with split tags for HyperView."
    )

    # Input/output path options.
    for flag, default in (
        ("--dataset_root", DEFAULT_DATASET_ROOT),
        ("--coreset_csv", DEFAULT_CORESET_CSV),
        ("--output_dir", DEFAULT_OUTPUT_DIR),
    ):
        parser.add_argument(flag, type=Path, default=default)

    # Hub publication target.
    parser.add_argument("--repo_id", type=str, default=DEFAULT_REPO_ID)
    parser.add_argument("--config_name", type=str, default="default")

    # Image processing knobs.
    parser.add_argument("--image_size", type=int, default=384)
    parser.add_argument("--jpeg_quality", type=int, default=90)
    parser.add_argument(
        "--image_variant",
        type=str,
        default="foreground_only",
        choices=["foreground_only", "full_rgb"],
    )
    parser.add_argument("--max_samples", type=int, default=None)

    # Publication behavior.
    parser.add_argument("--private", action="store_true")
    parser.add_argument("--hf_token_env", type=str, default="HF_TOKEN")
    parser.add_argument("--no_push", action="store_true")
    return parser.parse_args()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def load_rows(dataset_root: Path, coreset_csv: Path, max_samples: int | None) -> pd.DataFrame:
    """Load train.csv rows with label/split_tag/sample_id columns added.

    Files listed in the coreset CSV are tagged 'validation'; everything else
    is 'train'. Returns columns: filename, label, split_tag, sample_id.
    """
    train_csv = dataset_root / "train.csv"
    if not train_csv.exists():
        raise FileNotFoundError(f"Missing train.csv at {train_csv}")

    frame = pd.read_csv(train_csv).copy()
    validation_files = set(pd.read_csv(coreset_csv)["filename"].astype(str).tolist())

    frame["filename"] = frame["filename"].astype(str)
    frame["label"] = frame["ground_truth"].astype(str)
    frame["split_tag"] = np.where(frame["filename"].isin(validation_files), "validation", "train")
    frame["sample_id"] = frame["filename"]

    if max_samples is not None:
        frame = frame.iloc[: int(max_samples)].copy()

    return frame[["filename", "label", "split_tag", "sample_id"]]
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def load_variant_image(image_path: Path, image_variant: str) -> Image.Image:
    """Load an image as plain RGB, or with fully transparent pixels zeroed out."""
    if image_variant != "foreground_only":
        return Image.open(image_path).convert("RGB")
    # foreground_only: use the alpha channel as a hard binary mask on the RGB data.
    rgba_np = np.array(Image.open(image_path).convert("RGBA"), dtype=np.uint8)
    color = rgba_np[:, :, :3]
    alpha = rgba_np[:, :, 3]
    keep = (alpha > 0).astype(np.uint8)[:, :, np.newaxis]
    return Image.fromarray((color * keep).astype(np.uint8), mode="RGB")
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def build_resized_images(
    rows_df: pd.DataFrame,
    dataset_root: Path,
    output_images_dir: Path,
    image_size: int,
    jpeg_quality: int,
    image_variant: str,
) -> pd.DataFrame:
    """Resize every source image to a square JPEG and return a records table.

    Output columns: image (path to resized JPEG), label, filename, split_tag,
    sample_id. Raises FileNotFoundError if any source image is missing.
    """
    source_images_dir = dataset_root / "train"
    if not source_images_dir.exists():
        raise FileNotFoundError(f"Missing image directory: {source_images_dir}")

    output_images_dir.mkdir(parents=True, exist_ok=True)

    side = int(image_size)
    quality = int(jpeg_quality)
    records: list[dict[str, str]] = []
    for entry in rows_df.itertuples(index=False):
        filename = str(entry.filename)
        src = source_images_dir / filename
        if not src.exists():
            raise FileNotFoundError(f"Missing source image: {src}")

        resized = load_variant_image(src, image_variant=image_variant).resize(
            (side, side), Image.Resampling.BICUBIC
        )
        dst = output_images_dir / f"{Path(filename).stem}.jpg"
        resized.save(dst, format="JPEG", quality=quality, optimize=True)

        records.append(
            {
                "image": str(dst),
                "label": str(entry.label),
                "filename": filename,
                "split_tag": str(entry.split_tag),
                "sample_id": str(entry.sample_id),
            }
        )

    return pd.DataFrame(records)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def build_hf_dataset(records_df: pd.DataFrame) -> Dataset:
    """Convert the records table into a Hugging Face Dataset with a decoded image column."""
    columns = ["image", "label", "filename", "split_tag", "sample_id"]
    payload = {name: records_df[name].tolist() for name in columns}
    # cast_column turns the image-path strings into the HF Image feature type.
    return Dataset.from_dict(payload).cast_column("image", HFImage())
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def maybe_push_to_hub(
    dataset: Dataset,
    repo_id: str,
    config_name: str,
    private: bool,
    hf_token_env: str,
    no_push: bool,
) -> str:
    """Push *dataset* to the HF Hub unless disabled; return a status string.

    Raises RuntimeError when a push is requested but the token environment
    variable named by *hf_token_env* is unset or empty.
    """
    if no_push:
        return "skipped (--no_push)"

    token = os.environ.get(hf_token_env)
    if not token:
        raise RuntimeError(
            f"Missing Hugging Face token in environment variable {hf_token_env}."
        )

    push_kwargs = {
        "repo_id": repo_id,
        "config_name": config_name,
        "token": token,
        "private": bool(private),
    }
    dataset.push_to_hub(**push_kwargs)
    return f"pushed:{repo_id}:{config_name}"
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def main() -> int:
    """Build resized demo images, assemble the HF dataset, optionally push it, and
    record a publish_metadata.json summary.

    Returns 0 on success (used as the process exit code).
    """
    args = parse_args()

    output_dir = args.output_dir.resolve()
    images_out = output_dir / "images"
    dataset_out = output_dir / "hf_dataset"
    output_dir.mkdir(parents=True, exist_ok=True)

    rows_df = load_rows(
        dataset_root=args.dataset_root.resolve(),
        coreset_csv=args.coreset_csv.resolve(),
        max_samples=args.max_samples,
    )
    if rows_df.empty:
        raise RuntimeError("No dataset rows found for publish pipeline.")

    records_df = build_resized_images(
        rows_df=rows_df,
        dataset_root=args.dataset_root.resolve(),
        output_images_dir=images_out,
        image_size=int(args.image_size),
        jpeg_quality=int(args.jpeg_quality),
        image_variant=args.image_variant,
    )

    dataset = build_hf_dataset(records_df)
    # Always persist locally first so a failed push can be retried without rebuilding.
    dataset.save_to_disk(str(dataset_out))

    publish_status = maybe_push_to_hub(
        dataset=dataset,
        repo_id=args.repo_id,
        config_name=args.config_name,
        private=args.private,
        hf_token_env=args.hf_token_env,
        no_push=args.no_push,
    )

    # Record the build inputs and outcome for reproducibility.
    metadata = {
        "generated_at_utc": utc_now(),
        "dataset_root": str(args.dataset_root.resolve()),
        "coreset_csv": str(args.coreset_csv.resolve()),
        "output_dir": str(output_dir),
        "repo_id": args.repo_id,
        "config_name": args.config_name,
        "image_size": int(args.image_size),
        "jpeg_quality": int(args.jpeg_quality),
        "image_variant": args.image_variant,
        "num_rows": int(len(records_df)),
        "split_counts": records_df["split_tag"].value_counts().to_dict(),
        "push_status": publish_status,
    }

    metadata_path = output_dir / "publish_metadata.json"
    metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")

    print("=== HyperView demo dataset pipeline complete ===")
    print(f"Rows: {len(records_df)}")
    print(f"HF dataset saved to: {dataset_out}")
    print(f"Push status: {publish_status}")
    print(f"Metadata: {metadata_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|