luh1124 Claude Sonnet 4.6 committed on
Commit
8411b79
·
1 Parent(s): 1c92c96

fix(preload): match app_hyshape.py — load Hunyuan3D to CPU RAM, cache NeAR to disk

Browse files

Root cause of ZeroGPU timeout: Hunyuan3D was never loaded into CPU RAM,
so the GPU callback had to instantiate 7 GB of weights within its 240 s budget.

Fix mirrors app_hyshape.py (proven working):
- Step 1 (under lock): load Hunyuan3D into CPU RAM at startup.
GPU callback only does .to("cuda") + inference — no download/load wait.
- Step 2 (no lock): snapshot_download NeAR for disk cache.
NeAR cannot be instantiated in main process (BiRefNet triggers CUDA init);
it loads lazily inside the first @GPU callback where CUDA is available.
- Restore _MODEL_LOCK in _ensure_geometry_on_cuda (released before .to("cuda")).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +25 -19
app.py CHANGED
@@ -135,27 +135,29 @@ def _load_geometry_cpu_locked() -> None:
135
 
136
 
137
  def _preload_worker() -> None:
138
- """Pre-warm HF disk cache only — no model instantiation, no locks.
139
 
140
- snapshot_download is thread-safe and idempotent; it does NOT hold
141
- _MODEL_LOCK, so GPU callbacks are never blocked waiting for this thread.
142
- Model instantiation happens lazily inside the first @GPU callback where
143
- CUDA is available and no ZeroGPU timeout risk exists.
 
144
  """
 
 
 
 
 
 
 
 
 
145
  try:
146
  from huggingface_hub import snapshot_download
147
- for _repo in (
148
- os.environ.get("NEAR_HUNYUAN_PRETRAINED", "tencent/Hunyuan3D-2.1"),
149
- "luh0502/NeAR",
150
- ):
151
- try:
152
- print(f"[NeAR] preload: caching {_repo!r}…", flush=True)
153
- snapshot_download(repo_id=_repo, token=os.environ.get("HF_TOKEN"))
154
- print(f"[NeAR] preload: {_repo!r} ready.", flush=True)
155
- except Exception as _exc:
156
- print(f"[NeAR] preload: {_repo!r} failed: {_exc}", flush=True)
157
  except Exception as exc:
158
- print(f"[NeAR] preload failed: {exc}", flush=True)
159
 
160
 
161
  # ── GPU ensure helpers ────────────────────────────────────────────────────────
@@ -163,7 +165,9 @@ def _preload_worker() -> None:
163
  # tone_mapper because each ZeroGPU call has a fresh CUDA context.
164
 
165
  def _ensure_near_on_cuda() -> NeARImageToRelightable3DPipeline:
166
- # No lock: ZeroGPU runs one GPU callback at a time — no concurrent access.
 
 
167
  _load_near_cpu_locked()
168
  assert PIPELINE is not None
169
  if torch.cuda.is_available():
@@ -186,8 +190,10 @@ def _teardown_near() -> None:
186
 
187
 
188
  def _ensure_geometry_on_cuda() -> Hunyuan3DDiTFlowMatchingPipeline:
189
- # No lock: ZeroGPU runs one GPU callback at a time — no concurrent access.
190
- _load_geometry_cpu_locked()
 
 
191
  assert GEOMETRY_PIPELINE is not None
192
  if torch.cuda.is_available():
193
  GEOMETRY_PIPELINE.to("cuda")
 
135
 
136
 
137
  def _preload_worker() -> None:
138
+ """Mirror app_hyshape.py: load Hunyuan3D into CPU RAM under lock.
139
 
140
+ Hunyuan3D is safe to load in the main process (no CUDA init).
141
+ The GPU callback then only does .to("cuda") + inference — no download wait.
142
+
143
+ NeAR cannot be preloaded (BiRefNet triggers CUDA init in main process).
144
+ We only warm its disk cache so the GPU callback loads from disk, not network.
145
  """
146
+ # Step 1: load Hunyuan3D into CPU RAM (same pattern as app_hyshape.py).
147
+ try:
148
+ with _MODEL_LOCK:
149
+ _load_geometry_cpu_locked()
150
+ print("[NeAR] preload: Hunyuan3D in CPU RAM.", flush=True)
151
+ except Exception as exc:
152
+ print(f"[NeAR] preload: Hunyuan3D failed: {exc}", flush=True)
153
+
154
+ # Step 2: warm NeAR disk cache (no CUDA, no lock, no instantiation).
155
  try:
156
  from huggingface_hub import snapshot_download
157
+ snapshot_download(repo_id="luh0502/NeAR", token=os.environ.get("HF_TOKEN"))
158
+ print("[NeAR] preload: NeAR disk cache ready.", flush=True)
 
 
 
 
 
 
 
 
159
  except Exception as exc:
160
+ print(f"[NeAR] preload: NeAR disk cache failed: {exc}", flush=True)
161
 
162
 
163
  # ── GPU ensure helpers ────────────────────────────────────────────────────────
 
165
  # tone_mapper because each ZeroGPU call has a fresh CUDA context.
166
 
167
  def _ensure_near_on_cuda() -> NeARImageToRelightable3DPipeline:
168
+ # NeAR loads lazily here (BiRefNet/DINOv2 need CUDA available — only safe
169
+ # inside @GPU callback, not in main process).
170
+ # ZeroGPU runs one GPU callback at a time so no lock is needed.
171
  _load_near_cpu_locked()
172
  assert PIPELINE is not None
173
  if torch.cuda.is_available():
 
190
 
191
 
192
  def _ensure_geometry_on_cuda() -> Hunyuan3DDiTFlowMatchingPipeline:
193
+ # Hunyuan3D is pre-loaded into CPU RAM by _preload_worker, so this is
194
+ # usually a no-op. Lock is released before .to("cuda").
195
+ with _MODEL_LOCK:
196
+ _load_geometry_cpu_locked()
197
  assert GEOMETRY_PIPELINE is not None
198
  if torch.cuda.is_available():
199
  GEOMETRY_PIPELINE.to("cuda")