techfreakworm committed on
Commit
14fcab5
·
unverified ·
1 Parent(s): af37c06

feat(spaces): mirror build-time HF cache to runtime-writable tree

Browse files

preload_from_hub populates ~/.cache/huggingface/ as the build user,
leaving it read-only for runtime uid 1000. Lazy hf_hub_download calls
for non-preloaded files (GGUF, camera LoRAs) failed with "Permission
denied". chmod couldn't help — wrong inode owner.

_mirror_preload_hf_cache() walks the preload tree once at bootstrap,
building a parallel ~/hf-cache-rw/ that we own:

- blobs/<sha>: hardlinked (zero-copy, shared inode, instant reads).
Falls back to symlink on EXDEV (cross-device).
- snapshots/<commit>/<file>: relative symlinks preserved. Targets
resolve within the mirror tree, no extra walking.
- refs/<branch>: byte-copied — HF lib overwrites these on etag check,
hardlinks would fail there.
- everything else: byte-copied (safest default for unknown files).

Sets HF_HOME + HF_HUB_CACHE so HF lib reads/writes through the mirror.
After this:
- preloaded files: instant cache hit (no network)
- new lazy downloads: write to dirs we created → no permission errors

Tested locally on a synthetic preload tree: hardlinks shared inode,
snapshot symlinks resolved correctly, refs/main was overwritable, new
files in both preloaded and entirely-new repos wrote successfully.

Files changed (2) hide show
  1. app.py +93 -17
  2. docs/future_improvements.md +19 -0
app.py CHANGED
@@ -63,6 +63,91 @@ def _git_clone(url: str, dst: pathlib.Path, ref: str) -> None:
63
  subprocess.check_call(["git", "-C", str(dst), "checkout", "-q", "FETCH_HEAD"])
64
 
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  def _bootstrap() -> None:
67
  on_spaces = _on_spaces()
68
  # /data requires the paid persistent-storage add-on (separate from Pro).
@@ -95,24 +180,15 @@ def _bootstrap() -> None:
95
  sys.path.insert(0, str(comfy_dir))
96
  os.environ.setdefault("COMFY_MODELS_DIR", str(comfy_dir / "models"))
97
 
98
- # Make the HF cache writable: preload_from_hub populates ~/.cache/huggingface
99
- # during build, which can leave it in a state that blocks runtime
100
- # hf_hub_download writes (e.g., xet's lockdir, blob targets). chmod -R u+rwX
101
- # so any model NOT covered by preload (camera LoRAs, conditional GGUF, etc.)
102
- # can still lazy-download. Failures here are non-fatal.
 
 
103
  if on_spaces:
104
- import subprocess
105
-
106
- hf_cache = pathlib.Path.home() / ".cache" / "huggingface"
107
- if hf_cache.exists():
108
- try:
109
- subprocess.run(
110
- ["chmod", "-R", "u+rwX", str(hf_cache)],
111
- check=False,
112
- timeout=30,
113
- )
114
- except Exception as exc:
115
- print(f"[bootstrap] hf cache chmod skipped: {exc}", flush=True)
116
 
117
  # Stage placeholder input files so the workflow's hard-referenced loaders
118
  # (LoadImage/VHS_Load*) don't error at runtime even when the active mode
 
63
  subprocess.check_call(["git", "-C", str(dst), "checkout", "-q", "FETCH_HEAD"])
64
 
65
 
66
def _mirror_preload_hf_cache() -> None:
    """Mirror the build-populated HF cache into a writable runtime tree.

    The build-time `~/.cache/huggingface/` tree is read-only for the runtime
    user, so any `hf_hub_download` call that has to create or overwrite a
    file inside it fails with "Permission denied". This builds a parallel
    tree at `~/hf-cache-rw/` that the runtime user owns:

    - directories: created fresh via mkdir
    - symlinks (e.g. `snapshots/<commit>/<file>`): recreated with the same
      link target, so relative targets resolve inside the mirror
    - files under a `blobs/` directory: hardlinked; if `os.link` fails for
      any reason (cross-device, hardlink restrictions) an absolute symlink
      to the source file is created instead
    - every other regular file (`refs/<branch>`, lock files, markers, ...):
      byte-copied and made owner-writable, because such files may need to
      be rewritten in place

    `HF_HOME` and `HF_HUB_CACHE` are always pointed at the mirror, even when
    the source tree does not exist. Entries already present in the mirror
    are left untouched, so calling this again is a cheap no-op.

    Per-entry failures are logged and counted but never raised; only the
    initial creation of the mirror root can raise `OSError`.
    """
    import shutil
    import stat

    src_root = pathlib.Path.home() / ".cache" / "huggingface"
    dst_root = pathlib.Path.home() / "hf-cache-rw"
    dst_root.mkdir(parents=True, exist_ok=True)
    os.environ["HF_HOME"] = str(dst_root)
    os.environ["HF_HUB_CACHE"] = str(dst_root / "hub")

    if not src_root.exists():
        return

    counts = {"dirs": 0, "hardlinks": 0, "symlinks": 0, "copies": 0, "errors": 0}

    def _may_hardlink(rel_path: pathlib.PurePath) -> bool:
        # Only content-addressed blobs are safe to share by inode: they are
        # never rewritten in place. Anything under refs/ must stay a copy.
        parts = rel_path.parts
        return "blobs" in parts and "refs" not in parts

    def _copy_writable(src: pathlib.Path, dst: pathlib.Path) -> None:
        # copy2 carries over the source mode bits; a read-only source would
        # yield a read-only copy, so force the owner-write bit afterwards.
        shutil.copy2(src, dst)
        dst.chmod(dst.stat().st_mode | stat.S_IWUSR)
        counts["copies"] += 1

    def _walk(s: pathlib.Path, d: pathlib.Path) -> None:
        try:
            d.mkdir(parents=True, exist_ok=True)
            counts["dirs"] += 1
        except OSError as exc:
            print(f"[bootstrap] mirror mkdir fail {d}: {exc}", flush=True)
            counts["errors"] += 1
            return

        # List eagerly so an unreadable directory is reported once here
        # instead of escaping from the middle of the loop (and, for the
        # root directory, out of bootstrap entirely).
        try:
            entries = list(s.iterdir())
        except OSError as exc:
            print(f"[bootstrap] mirror list fail {s}: {exc}", flush=True)
            counts["errors"] += 1
            return

        for entry in entries:
            de = d / entry.name
            try:
                if entry.is_symlink():
                    # is_symlink() also catches a dangling link left behind
                    # by a previous run, which exists() reports as missing.
                    if de.exists() or de.is_symlink():
                        continue
                    de.symlink_to(os.readlink(str(entry)))
                    counts["symlinks"] += 1
                elif entry.is_dir():
                    _walk(entry, de)
                elif entry.is_file():
                    if de.exists() or de.is_symlink():
                        continue
                    if not _may_hardlink(de.relative_to(dst_root)):
                        _copy_writable(entry, de)
                        continue
                    try:
                        os.link(str(entry), str(de))
                        counts["hardlinks"] += 1
                    except OSError:
                        # Cross-device or other — fall back to symlink.
                        de.symlink_to(entry)
                        counts["symlinks"] += 1
            except OSError as exc:
                print(f"[bootstrap] mirror skip {entry}: {exc}", flush=True)
                counts["errors"] += 1

    _walk(src_root, dst_root)
    print(
        f"[bootstrap] hf cache mirrored to {dst_root}: "
        f"{counts['dirs']} dirs, {counts['hardlinks']} hardlinks, "
        f"{counts['symlinks']} symlinks, {counts['copies']} copies, "
        f"{counts['errors']} errors",
        flush=True,
    )
149
+
150
+
151
  def _bootstrap() -> None:
152
  on_spaces = _on_spaces()
153
  # /data requires the paid persistent-storage add-on (separate from Pro).
 
180
  sys.path.insert(0, str(comfy_dir))
181
  os.environ.setdefault("COMFY_MODELS_DIR", str(comfy_dir / "models"))
182
 
183
+ # Mirror the build-time HF cache (populated by preload_from_hub, owned by
184
+ # build user read-only for runtime user 1000) into a writable parallel
185
+ # tree under $HOME, then point HF_HUB_CACHE / HF_HOME at it. After this:
186
+ # - preloaded blobs are accessible via hardlink (no data copy, instant reads)
187
+ # - relative snapshot symlinks resolve within the mirror
188
+ # - refs/* are byte-copies so HF lib can overwrite when commits advance
189
+ # - new lazy-downloaded files write to dirs we own → no permission errors
190
  if on_spaces:
191
+ _mirror_preload_hf_cache()
 
 
 
 
 
 
 
 
 
 
 
192
 
193
  # Stage placeholder input files so the workflow's hard-referenced loaders
194
  # (LoadImage/VHS_Load*) don't error at runtime even when the active mode
docs/future_improvements.md CHANGED
@@ -6,6 +6,25 @@ not commitment.
6
 
7
  ## Spaces / preload
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  ### ~~1. Stop preloading models that aren't referenced by any workflow~~ — DONE 2026-05-02
10
 
11
  Audit on 2026-05-02 showed two `Lightricks/LTX-2.3` files in `preload_from_hub`
 
6
 
7
  ## Spaces / preload
8
 
9
+ ### ~~0. Re-enable `preload_from_hub` via runtime cache mirror~~ — DONE 2026-05-02
10
+
11
+ Initial preload deployment failed because HF's build pipeline writes
12
+ `~/.cache/huggingface/` as the build user, leaving it read-only for runtime
13
+ user 1000. Lazy `hf_hub_download` for non-preloaded files (GGUF, camera LoRAs)
14
+ failed with `Permission denied (os error 13)`. `chmod` couldn't help — we
15
+ don't own the inode.
16
+
17
+ Fix landed as `_mirror_preload_hf_cache()`, called from `_bootstrap()`:
18
+ - Walks `~/.cache/huggingface/` and mirrors it into a parallel `~/hf-cache-rw/` tree we own
19
+ - Hardlinks `blobs/<sha>` files (zero-copy, shared inode, instant reads)
20
+ - Preserves relative snapshot symlinks (resolve within the mirror tree)
21
+ - Byte-copies `refs/<branch>` files (HF lib overwrites these on etag check)
22
+ - Sets `HF_HOME` + `HF_HUB_CACHE` to the mirror so HF lib uses our writable copy
23
+ - Falls back to a symlink if `os.link()` fails (e.g. EXDEV, cross-device)
24
+
25
+ Result: preloaded files are instantly available (cache hit on first generate),
26
+ non-preloaded files lazy-download into dirs we own (no permission errors).
27
+
28
  ### ~~1. Stop preloading models that aren't referenced by any workflow~~ — DONE 2026-05-02
29
 
30
  Audit on 2026-05-02 showed two `Lightricks/LTX-2.3` files in `preload_from_hub`