# NOTE(review): the three lines below are Hugging Face web-page residue
# (author avatar caption and commit banner), not Python source. Kept as
# comments so the file parses.
# andandandand's picture
# Deploy cold-start reliability update (source: 85cf4fa)
# d110c29 verified
#!/usr/bin/env python3
"""HyperView Space runtime for core-claims top jaguar ReID models."""
from __future__ import annotations
import json
import os
import re
import threading
import time
import traceback
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import numpy as np
from datasets import Dataset as HFDataset
from datasets import DatasetDict as HFDatasetDict
from datasets import load_dataset, load_from_disk
import hyperview as hv
from hyperview.api import Session
from hyperview.core.sample import Sample
# Host name the hosting platform advertises; may be a public hostname rather
# than a bindable interface (see _resolve_bind_host).
SPACE_HOST = os.environ.get("SPACE_HOST", "0.0.0.0")
# Interfaces considered safe to bind the server socket to directly.
LOCAL_BIND_HOSTS = {"0.0.0.0", "127.0.0.1", "localhost", "::", "::1"}
# Name of the HyperView dataset created/loaded by this runtime.
DATASET_NAME = os.environ.get("HYPERVIEW_DATASET_NAME", "jaguar_core_claims_demo")
# Source Hugging Face dataset: either a hub repo id or a local
# load_from_disk directory (see _load_hf_rows).
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "hyper3labs/jaguar-hyperview-demo")
HF_DATASET_CONFIG = os.environ.get("HF_DATASET_CONFIG", "default")
HF_DATASET_SPLIT = os.environ.get("HF_DATASET_SPLIT", "train")
# Directory holding precomputed embedding .npz files (defaults to ./assets
# next to this file).
EMBEDDING_ASSET_DIR = Path(
    os.environ.get(
        "EMBEDDING_ASSET_DIR",
        str((Path(__file__).resolve().parent / "assets").resolve()),
    )
)
# Manifest describing the embedding models/spaces (defaults to assets/manifest.json).
ASSET_MANIFEST_PATH = Path(
    os.environ.get("EMBEDDING_ASSET_MANIFEST", str((EMBEDDING_ASSET_DIR / "manifest.json").resolve()))
)
# serve_fast: start serving immediately and warm up in the background;
# blocking: finish warmup before the server starts.
DEFAULT_STARTUP_MODE = "serve_fast"
# exit: terminate the process when background warmup fails; warn: keep serving.
DEFAULT_FAILURE_POLICY = "exit"
# Number of samples inserted per storage batch during HF ingestion.
DEFAULT_BATCH_INSERT_SIZE = 500
# Where warmup progress is persisted as JSON for external health checks.
DEFAULT_WARMUP_STATUS_PATH = Path("/tmp/hyperview_warmup_status.json")
def _utc_now() -> str:
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
def _resolve_startup_mode() -> str:
    """Read HYPERVIEW_STARTUP_MODE, falling back to the default on bad values.

    Returns either "serve_fast" or "blocking".
    """
    requested = os.environ.get("HYPERVIEW_STARTUP_MODE", DEFAULT_STARTUP_MODE)
    startup_mode = requested.strip().lower()
    if startup_mode not in {"serve_fast", "blocking"}:
        # Unknown value: warn and continue with the safe default.
        print(
            f"Invalid HYPERVIEW_STARTUP_MODE={startup_mode!r}; "
            f"falling back to {DEFAULT_STARTUP_MODE!r}."
        )
        return DEFAULT_STARTUP_MODE
    return startup_mode
def _resolve_failure_policy() -> str:
    """Read HYPERVIEW_WARMUP_FAILURE_POLICY, defaulting on invalid values.

    Returns either "exit" or "warn".
    """
    requested = os.environ.get("HYPERVIEW_WARMUP_FAILURE_POLICY", DEFAULT_FAILURE_POLICY)
    failure_policy = requested.strip().lower()
    if failure_policy not in {"exit", "warn"}:
        # Unknown value: warn and continue with the safe default.
        print(
            f"Invalid HYPERVIEW_WARMUP_FAILURE_POLICY={failure_policy!r}; "
            f"falling back to {DEFAULT_FAILURE_POLICY!r}."
        )
        return DEFAULT_FAILURE_POLICY
    return failure_policy
def _resolve_batch_insert_size() -> int:
    """Parse HYPERVIEW_BATCH_INSERT_SIZE as a positive integer.

    Falls back to DEFAULT_BATCH_INSERT_SIZE when the variable is unset.

    Raises:
        ValueError: if the value is not an integer or is <= 0.
    """
    raw_value = os.environ.get("HYPERVIEW_BATCH_INSERT_SIZE", str(DEFAULT_BATCH_INSERT_SIZE)).strip()
    try:
        parsed = int(raw_value)
    except ValueError as exc:
        raise ValueError(f"Invalid integer value for HYPERVIEW_BATCH_INSERT_SIZE: {raw_value}") from exc
    if parsed <= 0:
        raise ValueError(f"HYPERVIEW_BATCH_INSERT_SIZE must be > 0; got {parsed}")
    return parsed
def _resolve_warmup_status_path() -> Path:
raw = os.environ.get("HYPERVIEW_WARMUP_STATUS_PATH")
if raw is None:
return DEFAULT_WARMUP_STATUS_PATH
return Path(raw)
class WarmupStatusTracker:
    """Tracks warmup state and persists it to a JSON status file."""

    def __init__(self, status_path: Path):
        self._status_path = status_path
        self._lock = threading.Lock()
        started = _utc_now()
        # Initial state: warmup has not begun any real phase yet.
        self._state: dict[str, Any] = {
            "status": "starting",
            "phase": "boot",
            "counts": {},
            "error": None,
            "timestamps": {
                "started_at": started,
                "updated_at": started,
            },
        }
        self._persist_locked()

    def update(
        self,
        *,
        status: str | None = None,
        phase: str | None = None,
        counts: dict[str, Any] | None = None,
        error: dict[str, Any] | None = None,
    ) -> None:
        """Merge the given fields into the tracked state and persist it.

        Counts are merged (dict.update) rather than replaced; terminal statuses
        also stamp a ready_at/failed_at timestamp.
        """
        with self._lock:
            stamp = _utc_now()
            for field, value in (("status", status), ("phase", phase), ("error", error)):
                if value is not None:
                    self._state[field] = value
            if counts:
                self._state["counts"].update(counts)
            timestamps = self._state["timestamps"]
            timestamps["updated_at"] = stamp
            if status == "ready":
                timestamps["ready_at"] = stamp
            if status == "failed":
                timestamps["failed_at"] = stamp
            self._persist_locked()

    @property
    def path(self) -> Path:
        """Location of the persisted status JSON file."""
        return self._status_path

    def _persist_locked(self) -> None:
        """Write the state atomically (tmp file + rename); OSError is logged, not raised."""
        try:
            self._status_path.parent.mkdir(parents=True, exist_ok=True)
            scratch = self._status_path.with_name(f"{self._status_path.name}.tmp")
            scratch.write_text(json.dumps(self._state, indent=2, sort_keys=True), encoding="utf-8")
            scratch.replace(self._status_path)
        except OSError as exc:
            print(f"Warmup status warning: failed writing status JSON to {self._status_path} ({exc})")
def _patch_hyperview_default_panel() -> None:
    """Patch HyperView 0.3.1 frontend for default panel and dock cache-key migration.

    HyperView currently has no public API for these behaviors. This runtime patch is
    intentionally narrow and idempotent, targeting the known bundled chunk for v0.3.1.
    """
    default_panel = os.environ.get("HYPERVIEW_DEFAULT_PANEL", "spherical3d").strip().lower()
    apply_default_panel_patch = default_panel in {"spherical3d", "sphere3d"}
    if not apply_default_panel_patch:
        # Deliberately no early return: the dock cache-key migration below must
        # run even when the default-panel patch itself is disabled.
        print(f"Skipping frontend default-panel patch (HYPERVIEW_DEFAULT_PANEL={default_panel!r}).")
    # Layout cache key the frontend should use; bumping the version via env var
    # invalidates previously cached dock layouts in users' browsers.
    cache_version = os.environ.get("HYPERVIEW_LAYOUT_CACHE_VERSION", "v6").strip() or "v6"
    target_layout_key = f"hyperview:dockview-layout:{cache_version}"
    legacy_layout_key = "hyperview:dockview-layout:v5"
    layout_key_pattern = r"hyperview:dockview-layout:v\d+"
    # Bundled Next.js chunk shipped inside the installed hyperview package;
    # the patch targets this exact file and degrades to a no-op if missing.
    chunk_path = (
        Path(hv.__file__).resolve().parent
        / "server"
        / "static"
        / "_next"
        / "static"
        / "chunks"
        / "077b38561d6ea80d.js"
    )
    if not chunk_path.exists():
        print(f"Default-panel patch skipped: chunk not found at {chunk_path}")
        return
    # Minified-source markers. `marker_after` is `marker_before` plus an extra
    # statement; finding `marker_after` means a previous run already patched
    # the chunk (idempotence check).
    marker_before = 'v||(v=n)};if(f&&l&&w({id:dr,title:"Euclidean"'
    marker_after = 'v||(v=n),t.id===dd&&n.api.setActive()};if(f&&l&&w({id:dr,title:"Euclidean"'
    try:
        payload = chunk_path.read_text(encoding="utf-8")
    except OSError as exc:
        print(f"Default-panel patch skipped: failed reading chunk ({exc})")
        return
    patched = payload
    changed = False
    if apply_default_panel_patch:
        if marker_after in patched:
            print("HyperView frontend already patched for Sphere 3D default panel.")
        elif marker_before in patched:
            # First replacement only; the marker is expected to be unique.
            patched = patched.replace(marker_before, marker_after, 1)
            changed = True
            print("Patched HyperView frontend: Sphere 3D will open as default scatter panel.")
        else:
            # Chunk contents differ from the expected v0.3.1 build; do nothing.
            print("Default-panel patch skipped: expected marker not found in HyperView chunk.")
    # Dock cache-key migration: prefer the exact legacy key; otherwise migrate
    # whatever versioned key the chunk currently carries.
    if target_layout_key in patched:
        print(f"HyperView frontend already uses dock cache key '{target_layout_key}'.")
    elif legacy_layout_key in patched:
        patched = patched.replace(legacy_layout_key, target_layout_key, 1)
        changed = True
        print(f"Patched HyperView frontend: dock cache key {legacy_layout_key} -> {target_layout_key}.")
    else:
        # Legacy key absent; fall back to detecting any versioned layout key.
        discovered = re.search(layout_key_pattern, patched)
        if discovered:
            source_key = discovered.group(0)
            if source_key == target_layout_key:
                print(f"HyperView frontend already uses dock cache key '{target_layout_key}'.")
            else:
                print(
                    f"Dock cache patch notice: expected legacy key '{legacy_layout_key}' not found; "
                    f"migrating detected key '{source_key}' -> '{target_layout_key}'."
                )
                patched = patched.replace(source_key, target_layout_key, 1)
                changed = True
        else:
            print(
                "Dock cache patch warning: expected layout cache key marker "
                f"'{legacy_layout_key}' not found in HyperView chunk."
            )
    if not changed:
        return
    # Write back only when something actually changed; a failed write leaves
    # the original chunk intact.
    try:
        chunk_path.write_text(patched, encoding="utf-8")
    except OSError as exc:
        print(f"Frontend patch skipped: failed writing chunk ({exc})")
def _resolve_bind_host() -> tuple[str, str | None]:
explicit_bind = os.environ.get("HYPERVIEW_BIND_HOST")
if explicit_bind:
return explicit_bind, None
if SPACE_HOST in LOCAL_BIND_HOSTS:
return SPACE_HOST, None
return "0.0.0.0", f"SPACE_HOST='{SPACE_HOST}' is non-local; falling back to 0.0.0.0"
def _resolve_port() -> int:
for key in ("SPACE_PORT", "PORT"):
value = os.environ.get(key)
if value:
try:
return int(value)
except ValueError as exc:
raise ValueError(f"Invalid integer value for {key}: {value}") from exc
return 7860
def load_asset_manifest(path: Path) -> dict[str, Any]:
    """Load and validate the embedding-asset manifest JSON.

    Raises:
        FileNotFoundError: if the manifest file does not exist.
        ValueError: if the payload does not contain a 'models' list.
    """
    if not path.exists():
        raise FileNotFoundError(
            f"Embedding asset manifest not found: {path}. "
            "Run scripts/build_hyperview_demo_assets.py first."
        )
    payload = json.loads(path.read_text(encoding="utf-8"))
    manifest_is_valid = "models" in payload and isinstance(payload["models"], list)
    if not manifest_is_valid:
        raise ValueError(f"Invalid asset manifest format: {path}")
    return payload
def _load_hf_rows() -> HFDataset:
    """Load the demo rows either from a local on-disk dataset or the HF Hub.

    A local directory at HF_DATASET_REPO wins; for a DatasetDict the configured
    split is preferred, then 'train', then whichever split comes first.
    """
    local_path = Path(HF_DATASET_REPO)
    if not local_path.exists():
        return load_dataset(HF_DATASET_REPO, name=HF_DATASET_CONFIG, split=HF_DATASET_SPLIT)
    loaded = load_from_disk(str(local_path))
    if not isinstance(loaded, HFDatasetDict):
        return loaded
    for candidate_split in (HF_DATASET_SPLIT, "train"):
        if candidate_split in loaded:
            return loaded[candidate_split]
    return loaded[next(iter(loaded.keys()))]
def ingest_hf_dataset_samples(dataset: hv.Dataset, batch_insert_size: int | None = None) -> dict[str, int]:
    """Ingest HF dataset rows into the HyperView dataset, skipping existing IDs.

    Images are decoded once and cached as JPEGs under the media directory, and
    samples are inserted in batches through the dataset's (private) storage API,
    so re-runs are idempotent.

    Args:
        dataset: Target HyperView dataset.
        batch_insert_size: Insert batch size; defaults to the
            HYPERVIEW_BATCH_INSERT_SIZE environment setting.

    Returns:
        Counters: candidates, existing, added, saved_images, duplicates.

    Raises:
        ValueError: if the effective batch size is not positive.
    """
    rows = _load_hf_rows()
    # Per-dataset media cache directory for decoded JPEG copies of the rows.
    media_root = Path(os.environ.get("HYPERVIEW_MEDIA_DIR", "./demo_data/media")) / DATASET_NAME
    media_root.mkdir(parents=True, exist_ok=True)
    effective_batch_size = _resolve_batch_insert_size() if batch_insert_size is None else int(batch_insert_size)
    if effective_batch_size <= 0:
        raise ValueError(f"batch_insert_size must be > 0; got {effective_batch_size}")
    # First pass: one lightweight record per unique sample_id (first occurrence
    # wins; later duplicates are only counted).
    records_by_id: dict[str, dict[str, Any]] = {}
    duplicate_ids = 0
    for index, row in enumerate(rows):
        filename = str(row.get("filename", f"sample_{index:06d}.jpg"))
        sample_id = str(row.get("sample_id", filename))
        if sample_id in records_by_id:
            duplicate_ids += 1
            continue
        records_by_id[sample_id] = {
            "index": index,
            "filename": filename,
            "sample_id": sample_id,
            "label": str(row.get("label", "")),
            "split_tag": str(row.get("split_tag", "unknown")),
        }
    candidate_records = list(records_by_id.values())
    candidate_ids = [record["sample_id"] for record in candidate_records]
    # Ask storage which IDs already exist so only missing samples are inserted.
    existing_ids = dataset._storage.get_existing_ids(candidate_ids) if candidate_ids else set()
    missing_records = [record for record in candidate_records if record["sample_id"] not in existing_ids]
    print(
        "HF ingestion plan: "
        f"candidates={len(candidate_records)} existing={len(existing_ids)} "
        f"missing={len(missing_records)} duplicates={duplicate_ids} "
        f"batch_insert_size={effective_batch_size}"
    )
    added = 0
    saved_images = 0
    pending_samples: list[Sample] = []
    def flush_pending_samples() -> None:
        # Insert the accumulated batch via storage and reset the buffer.
        nonlocal added
        if not pending_samples:
            return
        dataset._storage.add_samples_batch(pending_samples)
        added += len(pending_samples)
        print(f"Inserted sample batch: size={len(pending_samples)} total_inserted={added}")
        pending_samples.clear()
    for record in missing_records:
        sample_id = str(record["sample_id"])
        filename = str(record["filename"])
        # Random-access back into the HF dataset to decode the image only for
        # rows that actually need insertion.
        row = rows[int(record["index"])]
        image_obj = row["image"]
        # NOTE(review): the JPEG cache is keyed on Path(sample_id).stem, so
        # sample ids differing only by extension/suffix would collide — assumed
        # unique stems in this demo dataset; confirm against the source repo.
        image_path = media_root / f"{Path(sample_id).stem}.jpg"
        if not image_path.exists():
            image_obj.convert("RGB").save(image_path, format="JPEG", quality=90, optimize=True)
            saved_images += 1
        metadata = {
            "filename": filename,
            "sample_id": sample_id,
            "split_tag": str(record["split_tag"]),
            "identity": str(record["label"]),
            "source_repo": HF_DATASET_REPO,
            "source_config": HF_DATASET_CONFIG,
            "source_split": HF_DATASET_SPLIT,
        }
        pending_samples.append(
            Sample(
                id=sample_id,
                filepath=str(image_path),
                label=str(record["label"]),
                metadata=metadata,
            )
        )
        if len(pending_samples) >= effective_batch_size:
            flush_pending_samples()
    # Flush any final partial batch.
    flush_pending_samples()
    print(
        f"Ingested {added} HF samples into HyperView dataset '{DATASET_NAME}' "
        f"(saved_images={saved_images}, existing={len(existing_ids)})."
    )
    return {
        "candidates": len(candidate_records),
        "existing": len(existing_ids),
        "added": added,
        "saved_images": saved_images,
        "duplicates": duplicate_ids,
    }
def ensure_embedding_spaces(dataset: hv.Dataset, asset_manifest: dict[str, Any], asset_dir: Path) -> None:
    """Register every manifest model's precomputed embeddings with the dataset.

    Each .npz payload must contain 'ids' and a 2D 'vectors' array whose rows
    line up with the ids, and every id must already exist in the dataset.

    Raises:
        ValueError: on a missing embeddings_path, non-2D vectors, or an
            ids/vectors length mismatch.
        FileNotFoundError: when the .npz file is absent.
        RuntimeError: when an embedding id is not a known dataset sample.
    """
    dataset_ids = {sample.id for sample in dataset.samples}
    for entry in asset_manifest["models"]:
        model_key = str(entry["model_key"])
        space_key = str(entry["space_key"])
        embeddings_rel = entry.get("embeddings_path")
        if not embeddings_rel:
            raise ValueError(f"Missing embeddings_path in asset manifest for model {model_key}")
        embeddings_path = asset_dir / str(embeddings_rel)
        if not embeddings_path.exists():
            raise FileNotFoundError(
                f"Missing embeddings file for model {model_key}: {embeddings_path}"
            )
        # allow_pickle=False: the archive is expected to hold plain arrays only.
        payload = np.load(embeddings_path, allow_pickle=False)
        ids = [str(x) for x in payload["ids"].tolist()]
        vectors = np.asarray(payload["vectors"], dtype=np.float32)
        if vectors.ndim != 2:
            raise ValueError(f"Embeddings for {model_key} must be 2D; got {vectors.shape}")
        if len(ids) != vectors.shape[0]:
            raise ValueError(
                f"Embeddings/ID mismatch for {model_key}: {len(ids)} ids vs {vectors.shape[0]} vectors"
            )
        unknown_ids = sorted(set(ids) - dataset_ids)
        if unknown_ids:
            preview = ", ".join(unknown_ids[:5])
            raise RuntimeError(
                f"Embedding IDs missing from loaded dataset for {model_key}. "
                f"First missing IDs: {preview}"
            )
        space_config = {
            "provider": "precomputed-checkpoint",
            "geometry": str(entry["geometry"]),
            "comparison_key": entry.get("comparison_key"),
            "family": entry.get("family"),
            "checkpoint_path": entry.get("checkpoint_path"),
        }
        dataset._storage.ensure_space(
            model_id=model_key,
            dim=int(vectors.shape[1]),
            space_key=space_key,
            config=space_config,
        )
        dataset._storage.add_embeddings(space_key, ids, vectors)
        print(f"Ensured space {space_key} ({vectors.shape[0]} x {vectors.shape[1]})")
def ensure_layouts(dataset: hv.Dataset, asset_manifest: dict[str, Any]) -> list[str]:
    """Compute (or reuse) a UMAP visualization layout for each manifest model.

    Returns the list of layout keys, one per model, in manifest order.
    """
    ensured_keys: list[str] = []
    for entry in asset_manifest["models"]:
        space_key = str(entry["space_key"])
        # force=False lets the dataset reuse a previously computed layout.
        layout_key = dataset.compute_visualization(
            space_key=space_key,
            layout=str(entry.get("layout", "euclidean:2d")),
            method="umap",
            force=False,
        )
        ensured_keys.append(layout_key)
        print(f"Ensured layout {layout_key} for space={space_key}")
    return ensured_keys
def _run_warmup(dataset: hv.Dataset, tracker: WarmupStatusTracker) -> None:
    """Run the full warmup pipeline: ingest rows, register spaces, compute layouts.

    Progress and counters are reported through *tracker* after each phase; the
    tracker ends in status "ready" on success. Exceptions propagate to callers.
    """
    manifest = load_asset_manifest(ASSET_MANIFEST_PATH)
    tracker.update(
        status="running",
        phase="ingest",
        counts={"manifest_models": len(manifest.get("models", []))},
    )
    batch_size = _resolve_batch_insert_size()
    if len(dataset) > 0:
        # Already populated: report the current size and skip ingestion.
        stats = {
            "candidates": len(dataset),
            "existing": len(dataset),
            "added": 0,
            "saved_images": 0,
            "duplicates": 0,
        }
        print(f"Skipping HF ingestion because dataset '{DATASET_NAME}' already has {len(dataset)} samples.")
    else:
        print(
            f"Loading HF dataset rows from {HF_DATASET_REPO}[{HF_DATASET_CONFIG}] split={HF_DATASET_SPLIT}"
        )
        stats = ingest_hf_dataset_samples(dataset, batch_insert_size=batch_size)
    tracker.update(
        counts={
            "batch_insert_size": batch_size,
            "dataset_samples": len(dataset),
            **stats,
        }
    )
    tracker.update(phase="spaces")
    ensure_embedding_spaces(dataset, asset_manifest=manifest, asset_dir=EMBEDDING_ASSET_DIR)
    tracker.update(counts={"spaces": len(dataset.list_spaces())})
    tracker.update(phase="layouts")
    layouts = ensure_layouts(dataset, asset_manifest=manifest)
    tracker.update(
        status="ready",
        phase="ready",
        counts={
            "dataset_samples": len(dataset),
            "spaces": len(dataset.list_spaces()),
            "layouts": len(layouts),
        },
    )
    print(f"Dataset '{DATASET_NAME}' has {len(dataset)} samples")
    print(f"Spaces: {[space.space_key for space in dataset.list_spaces()]}")
    print(f"Layouts: {layouts}")
def _run_warmup_blocking(dataset: hv.Dataset, tracker: WarmupStatusTracker) -> None:
    """Run warmup synchronously; record any failure in the tracker and re-raise."""
    try:
        _run_warmup(dataset, tracker)
    except Exception as exc:
        details = traceback.format_exc()
        tracker.update(
            status="failed",
            phase="failed",
            error={
                "type": type(exc).__name__,
                "message": str(exc),
                "traceback": details,
            },
        )
        print(details)
        raise
def _warmup_worker(
    dataset: hv.Dataset,
    tracker: WarmupStatusTracker,
    failure_policy: str,
    failure_event: threading.Event,
    failure_holder: dict[str, str],
) -> None:
    """Background-thread warmup driver.

    On failure it persists the error through *tracker*, stores a short reason
    under failure_holder["error"], and — when *failure_policy* is "exit" —
    sets *failure_event* so the serving loop can shut the process down.
    """
    try:
        _run_warmup(dataset, tracker)
    except Exception as exc:
        details = traceback.format_exc()
        tracker.update(
            status="failed",
            phase="failed",
            error={
                "type": type(exc).__name__,
                "message": str(exc),
                "traceback": details,
            },
        )
        print("Warmup failed:")
        print(details)
        failure_holder["error"] = f"{type(exc).__name__}: {exc}"
        if failure_policy == "exit":
            failure_event.set()
def _start_server_session(dataset: hv.Dataset, bind_host: str, bind_port: int) -> Session:
    """Launch the HyperView server on a background thread and return its session."""
    server_session = Session(dataset, host=bind_host, port=bind_port)
    server_session.start(background=True)
    print(f"HyperView server is running at {server_session.url}")
    return server_session
def _serve_forever(
    session: Session,
    *,
    failure_event: threading.Event | None = None,
    failure_holder: dict[str, str] | None = None,
) -> None:
    """Block until interrupt, server-thread death, or a signalled warmup failure.

    Polls every 250 ms. Raises RuntimeError when the server thread dies or
    when *failure_event* fires; always stops the session on the way out.
    """
    try:
        while True:
            time.sleep(0.25)
            server_thread = session._server_thread
            if server_thread is not None and not server_thread.is_alive():
                raise RuntimeError("HyperView server stopped unexpectedly.")
            if failure_event is None or not failure_event.is_set():
                continue
            # Warmup signalled a fatal failure; include the reason when known.
            reason = failure_holder.get("error") if failure_holder is not None else None
            if reason:
                raise RuntimeError(f"Warmup failed and failure policy is 'exit': {reason}")
            raise RuntimeError("Warmup failed and failure policy is 'exit'.")
    except KeyboardInterrupt:
        pass
    finally:
        session.stop()
        if session._server_thread is not None:
            session._server_thread.join(timeout=2.0)
def main() -> None:
    """Entry point: patch the frontend, run/dispatch warmup, and serve HyperView."""
    _patch_hyperview_default_panel()
    startup_mode = _resolve_startup_mode()
    failure_policy = _resolve_failure_policy()
    status_path = _resolve_warmup_status_path()
    dataset = hv.Dataset(DATASET_NAME)
    tracker = WarmupStatusTracker(status_path)
    tracker.update(
        counts={
            "dataset_samples": len(dataset),
            "startup_mode": startup_mode,
            "failure_policy": failure_policy,
            "batch_insert_size": _resolve_batch_insert_size(),
        }
    )
    bind_host, bind_warning = _resolve_bind_host()
    bind_port = _resolve_port()
    if bind_warning:
        print(f"Bind host notice: {bind_warning}")
    print(
        "Starting HyperView runtime with "
        f"startup_mode={startup_mode} failure_policy={failure_policy} "
        f"status_path={status_path} bind_host={bind_host} bind_port={bind_port} "
        f"(SPACE_HOST={SPACE_HOST!r}, SPACE_PORT={os.environ.get('SPACE_PORT')!r}, "
        f"PORT={os.environ.get('PORT')!r})"
    )
    # Preparation-only mode: run warmup to completion, then exit without serving.
    if os.environ.get("HYPERVIEW_DEMO_PREP_ONLY") == "1":
        _run_warmup_blocking(dataset, tracker)
        print("Preparation-only mode enabled; skipping server launch.")
        return
    if startup_mode == "blocking":
        # Finish all warmup before the server starts accepting traffic.
        _run_warmup_blocking(dataset, tracker)
        session = _start_server_session(dataset, bind_host=bind_host, bind_port=bind_port)
        _serve_forever(session)
        return
    # serve_fast: warm up in the background while the server comes up immediately.
    failure_event = threading.Event()
    failure_holder: dict[str, str] = {}
    threading.Thread(
        target=_warmup_worker,
        name="hyperview-warmup",
        args=(dataset, tracker, failure_policy, failure_event, failure_holder),
        daemon=True,
    ).start()
    print("Warmup thread started in background.")
    session = _start_server_session(dataset, bind_host=bind_host, bind_port=bind_port)
    if failure_policy == "exit":
        _serve_forever(session, failure_event=failure_event, failure_holder=failure_holder)
    else:
        _serve_forever(session)


if __name__ == "__main__":
    main()