Spaces:

owenisas
/

stable-audio-3-lab

Running on Zero

App Files Files Community

owenisas committed 2 days ago

Commit

68da4a5

verified ·

1 Parent(s): 04f9cd3

Add runtime cache metadata and access check cache

Browse files

Files changed (2) hide show

README.md +12 -0
app.py +39 -11

README.md CHANGED Viewed

@@ -56,3 +56,15 @@ Stability AI's public MIT-licensed repository because its package metadata pins
 Torch 2.7.1. ZeroGPU currently provides Torch 2.8.0, so installing the upstream
 package through normal dependency resolution would downgrade Torch and break the
 ZeroGPU runtime.

 Torch 2.7.1. ZeroGPU currently provides Torch 2.8.0, so installing the upstream
 package through normal dependency resolution would downgrade Torch and break the
 ZeroGPU runtime.
+## Optimization notes
+- Repeated runs with the same selected model reuse the loaded model inside the
+  ZeroGPU worker when the worker stays warm. Run metadata includes `cache_hit`
+  and `load_elapsed_s` so this is visible.
+- Successful gated-repo access checks are cached per token digest and repo ID
+  (600 seconds by default; see `SA3_ACCESS_CACHE_TTL_SECONDS`) to avoid a Hugging Face `HEAD` request on every generation.
+- The `stable-audio-3-optimized` repo currently provides MLX, ONNX, and
+  TensorRT assets. This Space keeps the portable PyTorch path because the
+  TensorRT engines are prebuilt for `sm_90`, while the current ZeroGPU host is
+  a Blackwell GPU, and MLX is Apple-only.

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 import gc
 import importlib
 import importlib.util
 import json
@@ -15,11 +16,13 @@ from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import Any
 import gradio as gr
 import numpy as np
-os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
 def _filter_known_unraisable(unraisable):
     object_name = getattr(unraisable.object, "__qualname__", "")
@@ -171,6 +174,8 @@ COLLECTION_ROWS = [
 MODEL_CACHE: dict[str, Any] = {"key": None, "model": None}
 AE_CACHE: dict[str, Any] = {"key": None, "model": None}
 MODEL_LOAD_LOCK = threading.RLock()
@@ -246,7 +251,20 @@ def stable_audio_token_hint(model: GenerationModel) -> str:
     )
 def user_can_download_gated_model(repo_id: str, token: str) -> tuple[bool, str | None]:
     request = urllib.request.Request(
         f"https://huggingface.co/{repo_id}/resolve/main/model_config.json",
         method="HEAD",
@@ -254,7 +272,10 @@ def user_can_download_gated_model(repo_id: str, token: str) -> tuple[bool, str |
     )
     try:
         with urllib.request.urlopen(request, timeout=20) as response:
-            return response.status < 400, None
     except urllib.error.HTTPError as exc:
         if exc.code in {401, 403}:
             return (
@@ -389,12 +410,13 @@ def load_generation_model(
     )
     if MODEL_CACHE["key"] == model_key and MODEL_CACHE["model"] is not None:
-        return MODEL_CACHE["model"], device
     with MODEL_LOAD_LOCK:
         if MODEL_CACHE["key"] == model_key and MODEL_CACHE["model"] is not None:
-            return MODEL_CACHE["model"], device
         MODEL_CACHE["model"] = None
         MODEL_CACHE["key"] = None
         clear_torch_memory()
@@ -406,7 +428,7 @@ def load_generation_model(
             model = StableAudioModel.from_pretrained(model_key, model_half=model_half)
         MODEL_CACHE["key"] = model_key
         MODEL_CACHE["model"] = model
-        return model, device
 def load_autoencoder(
@@ -429,12 +451,13 @@ def load_autoencoder(
         )
     if AE_CACHE["key"] == model_key and AE_CACHE["model"] is not None:
-        return AE_CACHE["model"], device
     with MODEL_LOAD_LOCK:
         if AE_CACHE["key"] == model_key and AE_CACHE["model"] is not None:
-            return AE_CACHE["model"], device
         AE_CACHE["model"] = None
         AE_CACHE["key"] = None
         clear_torch_memory()
@@ -445,7 +468,7 @@ def load_autoencoder(
             model = AutoencoderModel.from_pretrained(model_key)
         AE_CACHE["key"] = model_key
         AE_CACHE["model"] = model
-        return model, device
 def model_changed(model_key: str):
@@ -521,7 +544,7 @@ def generate_audio(
     if seed < 0:
         seed = int.from_bytes(os.urandom(4), "little") % 100000
-    model, device = load_generation_model(
         model_key,
         allow_cpu_medium,
         oauth_profile,
@@ -563,6 +586,8 @@ def generate_audio(
         "seed": seed,
         "sample_rate": sample_rate,
         "elapsed_s": elapsed,
         "output_file": out_file.name,
         "note": model_def.note,
         "auth_source": auth_source(oauth_profile, oauth_token, hf_api_token),
@@ -619,7 +644,7 @@ def roundtrip_autoencoder(
     progress(0.05, desc="Loading autoencoder")
     started = time.time()
-    model, device = load_autoencoder(
         model_key,
         allow_cpu_same_l,
         oauth_profile,
@@ -655,6 +680,8 @@ def roundtrip_autoencoder(
         "input_shape": list(waveform.shape),
         "latent_shape": list(latents.shape),
         "elapsed_s": round(time.time() - started, 3),
         "output_file": out_file.name,
         "auth_source": auth_source(oauth_profile, oauth_token, hf_api_token),
         "username": oauth_username(oauth_profile),
@@ -710,6 +737,7 @@ def runtime_status(
         "hf_api_token_present": bool(hf_api_token_value(hf_api_token)),
         "loaded_generation_model": MODEL_CACHE["key"],
         "loaded_autoencoder": AE_CACHE["key"],
     }

 from __future__ import annotations
 import gc
+import hashlib
 import importlib
 import importlib.util
 import json
 from dataclasses import dataclass
 from typing import Any
+os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
+os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 import gradio as gr
 import numpy as np
 def _filter_known_unraisable(unraisable):
     object_name = getattr(unraisable.object, "__qualname__", "")
 MODEL_CACHE: dict[str, Any] = {"key": None, "model": None}
 AE_CACHE: dict[str, Any] = {"key": None, "model": None}
+ACCESS_CACHE: dict[tuple[str, str], float] = {}
+ACCESS_CACHE_TTL_SECONDS = max(0, int(os.getenv("SA3_ACCESS_CACHE_TTL_SECONDS", "600")))
 MODEL_LOAD_LOCK = threading.RLock()
     )
+def access_cache_key(repo_id: str, token: str) -> tuple[str, str]:
+    token_digest = hashlib.sha256(token.encode("utf-8")).hexdigest()[:16]
+    return repo_id, token_digest
 def user_can_download_gated_model(repo_id: str, token: str) -> tuple[bool, str | None]:
+    cache_key = access_cache_key(repo_id, token)
+    cached_until = ACCESS_CACHE.get(cache_key)
+    now = time.time()
+    if cached_until is not None:
+        if cached_until > now:
+            return True, None
+        ACCESS_CACHE.pop(cache_key, None)
     request = urllib.request.Request(
         f"https://huggingface.co/{repo_id}/resolve/main/model_config.json",
         method="HEAD",
     )
     try:
         with urllib.request.urlopen(request, timeout=20) as response:
+            has_access = response.status < 400
+            if has_access and ACCESS_CACHE_TTL_SECONDS:
+                ACCESS_CACHE[cache_key] = time.time() + ACCESS_CACHE_TTL_SECONDS
+            return has_access, None
     except urllib.error.HTTPError as exc:
         if exc.code in {401, 403}:
             return (
     )
     if MODEL_CACHE["key"] == model_key and MODEL_CACHE["model"] is not None:
+        return MODEL_CACHE["model"], device, True, 0.0
     with MODEL_LOAD_LOCK:
         if MODEL_CACHE["key"] == model_key and MODEL_CACHE["model"] is not None:
+            return MODEL_CACHE["model"], device, True, 0.0
+        load_started = time.time()
         MODEL_CACHE["model"] = None
         MODEL_CACHE["key"] = None
         clear_torch_memory()
             model = StableAudioModel.from_pretrained(model_key, model_half=model_half)
         MODEL_CACHE["key"] = model_key
         MODEL_CACHE["model"] = model
+        return model, device, False, round(time.time() - load_started, 3)
 def load_autoencoder(
         )
     if AE_CACHE["key"] == model_key and AE_CACHE["model"] is not None:
+        return AE_CACHE["model"], device, True, 0.0
     with MODEL_LOAD_LOCK:
         if AE_CACHE["key"] == model_key and AE_CACHE["model"] is not None:
+            return AE_CACHE["model"], device, True, 0.0
+        load_started = time.time()
         AE_CACHE["model"] = None
         AE_CACHE["key"] = None
         clear_torch_memory()
             model = AutoencoderModel.from_pretrained(model_key)
         AE_CACHE["key"] = model_key
         AE_CACHE["model"] = model
+        return model, device, False, round(time.time() - load_started, 3)
 def model_changed(model_key: str):
     if seed < 0:
         seed = int.from_bytes(os.urandom(4), "little") % 100000
+    model, device, cache_hit, load_elapsed = load_generation_model(
         model_key,
         allow_cpu_medium,
         oauth_profile,
         "seed": seed,
         "sample_rate": sample_rate,
         "elapsed_s": elapsed,
+        "cache_hit": cache_hit,
+        "load_elapsed_s": load_elapsed,
         "output_file": out_file.name,
         "note": model_def.note,
         "auth_source": auth_source(oauth_profile, oauth_token, hf_api_token),
     progress(0.05, desc="Loading autoencoder")
     started = time.time()
+    model, device, cache_hit, load_elapsed = load_autoencoder(
         model_key,
         allow_cpu_same_l,
         oauth_profile,
         "input_shape": list(waveform.shape),
         "latent_shape": list(latents.shape),
         "elapsed_s": round(time.time() - started, 3),
+        "cache_hit": cache_hit,
+        "load_elapsed_s": load_elapsed,
         "output_file": out_file.name,
         "auth_source": auth_source(oauth_profile, oauth_token, hf_api_token),
         "username": oauth_username(oauth_profile),
         "hf_api_token_present": bool(hf_api_token_value(hf_api_token)),
         "loaded_generation_model": MODEL_CACHE["key"],
         "loaded_autoencoder": AE_CACHE["key"],
+        "access_cache_entries": len(ACCESS_CACHE),
     }