fix(preload): match app_hyshape.py — load Hunyuan3D to CPU RAM, cache NeAR to disk
Root cause of ZeroGPU timeout: Hunyuan3D was never loaded into CPU RAM,
so the GPU callback had to instantiate 7 GB of weights within its 240 s budget.
Fix mirrors app_hyshape.py (proven working):
- Step 1 (under lock): load Hunyuan3D into CPU RAM at startup.
GPU callback only does .to("cuda") + inference — no download/load wait.
- Step 2 (no lock): snapshot_download NeAR for disk cache.
NeAR cannot be instantiated in main process (BiRefNet triggers CUDA init);
it loads lazily inside the first @GPU callback where CUDA is available.
- Restore _MODEL_LOCK in _ensure_geometry_on_cuda (released before .to("cuda")).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -135,27 +135,29 @@ def _load_geometry_cpu_locked() -> None:
|
|
| 135 |
|
| 136 |
|
| 137 |
def _preload_worker() -> None:
|
| 138 |
-
"""
|
| 139 |
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
| 144 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
try:
|
| 146 |
from huggingface_hub import snapshot_download
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
"luh0502/NeAR",
|
| 150 |
-
):
|
| 151 |
-
try:
|
| 152 |
-
print(f"[NeAR] preload: caching {_repo!r}…", flush=True)
|
| 153 |
-
snapshot_download(repo_id=_repo, token=os.environ.get("HF_TOKEN"))
|
| 154 |
-
print(f"[NeAR] preload: {_repo!r} ready.", flush=True)
|
| 155 |
-
except Exception as _exc:
|
| 156 |
-
print(f"[NeAR] preload: {_repo!r} failed: {_exc}", flush=True)
|
| 157 |
except Exception as exc:
|
| 158 |
-
print(f"[NeAR] preload failed: {exc}", flush=True)
|
| 159 |
|
| 160 |
|
| 161 |
# ── GPU ensure helpers ────────────────────────────────────────────────────────
|
|
@@ -163,7 +165,9 @@ def _preload_worker() -> None:
|
|
| 163 |
# tone_mapper because each ZeroGPU call has a fresh CUDA context.
|
| 164 |
|
| 165 |
def _ensure_near_on_cuda() -> NeARImageToRelightable3DPipeline:
|
| 166 |
-
#
|
|
|
|
|
|
|
| 167 |
_load_near_cpu_locked()
|
| 168 |
assert PIPELINE is not None
|
| 169 |
if torch.cuda.is_available():
|
|
@@ -186,8 +190,10 @@ def _teardown_near() -> None:
|
|
| 186 |
|
| 187 |
|
| 188 |
def _ensure_geometry_on_cuda() -> Hunyuan3DDiTFlowMatchingPipeline:
|
| 189 |
-
#
|
| 190 |
-
|
|
|
|
|
|
|
| 191 |
assert GEOMETRY_PIPELINE is not None
|
| 192 |
if torch.cuda.is_available():
|
| 193 |
GEOMETRY_PIPELINE.to("cuda")
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
def _preload_worker() -> None:
|
| 138 |
+
"""Mirror app_hyshape.py: load Hunyuan3D into CPU RAM under lock.
|
| 139 |
|
| 140 |
+
Hunyuan3D is safe to load in the main process (no CUDA init).
|
| 141 |
+
The GPU callback then only does .to("cuda") + inference — no download wait.
|
| 142 |
+
|
| 143 |
+
NeAR cannot be preloaded (BiRefNet triggers CUDA init in main process).
|
| 144 |
+
We only warm its disk cache so the GPU callback loads from disk, not network.
|
| 145 |
"""
|
| 146 |
+
# Step 1: load Hunyuan3D into CPU RAM (same pattern as app_hyshape.py).
|
| 147 |
+
try:
|
| 148 |
+
with _MODEL_LOCK:
|
| 149 |
+
_load_geometry_cpu_locked()
|
| 150 |
+
print("[NeAR] preload: Hunyuan3D in CPU RAM.", flush=True)
|
| 151 |
+
except Exception as exc:
|
| 152 |
+
print(f"[NeAR] preload: Hunyuan3D failed: {exc}", flush=True)
|
| 153 |
+
|
| 154 |
+
# Step 2: warm NeAR disk cache (no CUDA, no lock, no instantiation).
|
| 155 |
try:
|
| 156 |
from huggingface_hub import snapshot_download
|
| 157 |
+
snapshot_download(repo_id="luh0502/NeAR", token=os.environ.get("HF_TOKEN"))
|
| 158 |
+
print("[NeAR] preload: NeAR disk cache ready.", flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
except Exception as exc:
|
| 160 |
+
print(f"[NeAR] preload: NeAR disk cache failed: {exc}", flush=True)
|
| 161 |
|
| 162 |
|
| 163 |
# ── GPU ensure helpers ────────────────────────────────────────────────────────
|
|
|
|
| 165 |
# tone_mapper because each ZeroGPU call has a fresh CUDA context.
|
| 166 |
|
| 167 |
def _ensure_near_on_cuda() -> NeARImageToRelightable3DPipeline:
|
| 168 |
+
# NeAR loads lazily here (BiRefNet/DINOv2 need CUDA available — only safe
|
| 169 |
+
# inside @GPU callback, not in main process).
|
| 170 |
+
# ZeroGPU runs one GPU callback at a time so no lock is needed.
|
| 171 |
_load_near_cpu_locked()
|
| 172 |
assert PIPELINE is not None
|
| 173 |
if torch.cuda.is_available():
|
|
|
|
| 190 |
|
| 191 |
|
| 192 |
def _ensure_geometry_on_cuda() -> Hunyuan3DDiTFlowMatchingPipeline:
|
| 193 |
+
# Hunyuan3D is pre-loaded into CPU RAM by _preload_worker, so this is
|
| 194 |
+
# usually a no-op. Lock is released before .to("cuda").
|
| 195 |
+
with _MODEL_LOCK:
|
| 196 |
+
_load_geometry_cpu_locked()
|
| 197 |
assert GEOMETRY_PIPELINE is not None
|
| 198 |
if torch.cuda.is_available():
|
| 199 |
GEOMETRY_PIPELINE.to("cuda")
|