fix(preload): match app_hyshape.py — load Hunyuan3D to CPU RAM, cache NeAR to disk
Root cause of ZeroGPU timeout: Hunyuan3D was never loaded into CPU RAM,
so the GPU callback had to instantiate 7 GB of weights within its 240 s budget.
Fix mirrors app_hyshape.py (proven working):
- Step 1 (under lock): load Hunyuan3D into CPU RAM at startup.
GPU callback only does .to("cuda") + inference — no download/load wait.
- Step 2 (no lock): snapshot_download NeAR for disk cache.
NeAR cannot be instantiated in main process (BiRefNet triggers CUDA init);
it loads lazily inside the first @GPU callback where CUDA is available.
- Restore _MODEL_LOCK in _ensure_geometry_on_cuda (released before .to("cuda")).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -135,27 +135,29 @@ def _load_geometry_cpu_locked() -> None:
|
|
| 135 |
|
| 136 |
|
| 137 |
def _preload_worker() -> None:
|
| 138 |
-
"""
|
| 139 |
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
| 144 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
try:
|
| 146 |
from huggingface_hub import snapshot_download
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
"luh0502/NeAR",
|
| 150 |
-
):
|
| 151 |
-
try:
|
| 152 |
-
print(f"[NeAR] preload: caching {_repo!r}…", flush=True)
|
| 153 |
-
snapshot_download(repo_id=_repo, token=os.environ.get("HF_TOKEN"))
|
| 154 |
-
print(f"[NeAR] preload: {_repo!r} ready.", flush=True)
|
| 155 |
-
except Exception as _exc:
|
| 156 |
-
print(f"[NeAR] preload: {_repo!r} failed: {_exc}", flush=True)
|
| 157 |
except Exception as exc:
|
| 158 |
-
print(f"[NeAR] preload failed: {exc}", flush=True)
|
| 159 |
|
| 160 |
|
| 161 |
# ── GPU ensure helpers ────────────────────────────────────────────────────────
|
|
@@ -163,7 +165,9 @@ def _preload_worker() -> None:
|
|
| 163 |
# tone_mapper because each ZeroGPU call has a fresh CUDA context.
|
| 164 |
|
| 165 |
def _ensure_near_on_cuda() -> NeARImageToRelightable3DPipeline:
|
| 166 |
-
#
|
|
|
|
|
|
|
| 167 |
_load_near_cpu_locked()
|
| 168 |
assert PIPELINE is not None
|
| 169 |
if torch.cuda.is_available():
|
|
@@ -186,8 +190,10 @@ def _teardown_near() -> None:
|
|
| 186 |
|
| 187 |
|
| 188 |
def _ensure_geometry_on_cuda() -> Hunyuan3DDiTFlowMatchingPipeline:
|
| 189 |
-
#
|
| 190 |
-
|
|
|
|
|
|
|
| 191 |
assert GEOMETRY_PIPELINE is not None
|
| 192 |
if torch.cuda.is_available():
|
| 193 |
GEOMETRY_PIPELINE.to("cuda")
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
def _preload_worker() -> None:
|
| 138 |
+
"""Mirror app_hyshape.py: load Hunyuan3D into CPU RAM under lock.
|
| 139 |
|
| 140 |
+
Hunyuan3D is safe to load in the main process (no CUDA init).
|
| 141 |
+
The GPU callback then only does .to("cuda") + inference — no download wait.
|
| 142 |
+
|
| 143 |
+
NeAR cannot be preloaded (BiRefNet triggers CUDA init in main process).
|
| 144 |
+
We only warm its disk cache so the GPU callback loads from disk, not network.
|
| 145 |
"""
|
| 146 |
+
# Step 1: load Hunyuan3D into CPU RAM (same pattern as app_hyshape.py).
|
| 147 |
+
try:
|
| 148 |
+
with _MODEL_LOCK:
|
| 149 |
+
_load_geometry_cpu_locked()
|
| 150 |
+
print("[NeAR] preload: Hunyuan3D in CPU RAM.", flush=True)
|
| 151 |
+
except Exception as exc:
|
| 152 |
+
print(f"[NeAR] preload: Hunyuan3D failed: {exc}", flush=True)
|
| 153 |
+
|
| 154 |
+
# Step 2: warm NeAR disk cache (no CUDA, no lock, no instantiation).
|
| 155 |
try:
|
| 156 |
from huggingface_hub import snapshot_download
|
| 157 |
+
snapshot_download(repo_id="luh0502/NeAR", token=os.environ.get("HF_TOKEN"))
|
| 158 |
+
print("[NeAR] preload: NeAR disk cache ready.", flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
except Exception as exc:
|
| 160 |
+
print(f"[NeAR] preload: NeAR disk cache failed: {exc}", flush=True)
|
| 161 |
|
| 162 |
|
| 163 |
# ── GPU ensure helpers ────────────────────────────────────────────────────────
|
|
|
|
| 165 |
# tone_mapper because each ZeroGPU call has a fresh CUDA context.
|
| 166 |
|
| 167 |
def _ensure_near_on_cuda() -> NeARImageToRelightable3DPipeline:
|
| 168 |
+
# NeAR loads lazily here (BiRefNet/DINOv2 need CUDA available — only safe
|
| 169 |
+
# inside @GPU callback, not in main process).
|
| 170 |
+
# ZeroGPU runs one GPU callback at a time so no lock is needed.
|
| 171 |
_load_near_cpu_locked()
|
| 172 |
assert PIPELINE is not None
|
| 173 |
if torch.cuda.is_available():
|
|
|
|
| 190 |
|
| 191 |
|
| 192 |
def _ensure_geometry_on_cuda() -> Hunyuan3DDiTFlowMatchingPipeline:
|
| 193 |
+
# Hunyuan3D is pre-loaded into CPU RAM by _preload_worker, so this is
|
| 194 |
+
# usually a no-op. Lock is released before .to("cuda").
|
| 195 |
+
with _MODEL_LOCK:
|
| 196 |
+
_load_geometry_cpu_locked()
|
| 197 |
assert GEOMETRY_PIPELINE is not None
|
| 198 |
if torch.cuda.is_available():
|
| 199 |
GEOMETRY_PIPELINE.to("cuda")
|