Spaces:
Running
Running
Commit ·
f54eaff
1
Parent(s): c1bf580
Implement bug fixes for file handling and JupyterLab checks
Browse files
Added multiple bug fixes to handle file size limits, exclude paths correctly, and ensure JupyterLab is not running during restore.
- jupyter-devdata-sync.py +111 -8
jupyter-devdata-sync.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
from __future__ import annotations
|
| 3 |
|
| 4 |
-
import os, shutil, tempfile, time
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
|
|
@@ -10,6 +10,12 @@ DATASET_NAME = os.environ.get("DEVDATA_DATASET_NAME", "").strip() or "huggingcla
|
|
| 10 |
BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "").strip() or os.environ.get("BACKUP_DATASET", "").strip() or "huggingclaw-backup"
|
| 11 |
JUPYTER_ROOT = Path(os.environ.get("JUPYTER_ROOT_DIR", "/home/node")).resolve()
|
| 12 |
INTERVAL = int((os.environ.get("DEVDATA_SYNC_INTERVAL", "").strip() or "180"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
def is_true(value):
|
| 14 |
return str(value).strip().lower() in {"1", "true", "yes", "on"}
|
| 15 |
|
|
@@ -26,12 +32,17 @@ def classify_error(exc: Exception) -> str:
|
|
| 26 |
return "safety-scan"
|
| 27 |
return "general"
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
EXCLUDE = {
|
| 30 |
".cache",
|
| 31 |
"node_modules",
|
| 32 |
".npm",
|
| 33 |
".yarn",
|
| 34 |
-
".local/share/Trash"
|
| 35 |
".ipynb_checkpoints",
|
| 36 |
".openclaw",
|
| 37 |
"app",
|
|
@@ -85,13 +96,38 @@ def snapshot(src: Path, dst: Path):
|
|
| 85 |
if p.is_dir():
|
| 86 |
target.mkdir(parents=True, exist_ok=True)
|
| 87 |
elif p.is_file():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
target.parent.mkdir(parents=True, exist_ok=True)
|
| 89 |
try:
|
| 90 |
shutil.copy2(p, target)
|
| 91 |
except OSError:
|
| 92 |
pass
|
| 93 |
|
| 94 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
|
| 96 |
try:
|
| 97 |
snapshot_download(repo_id=rid, repo_type="dataset", local_dir=str(tmp), local_dir_use_symlinks=False, token=HF_TOKEN)
|
|
@@ -99,6 +135,8 @@ def restore_once(api: HfApi, rid: str):
|
|
| 99 |
rel = p.relative_to(tmp)
|
| 100 |
if should_skip(rel):
|
| 101 |
continue
|
|
|
|
|
|
|
| 102 |
target = JUPYTER_ROOT / rel
|
| 103 |
if p.is_dir():
|
| 104 |
target.mkdir(parents=True, exist_ok=True)
|
|
@@ -118,15 +156,48 @@ def restore_once(api: HfApi, rid: str):
|
|
| 118 |
finally:
|
| 119 |
shutil.rmtree(tmp, ignore_errors=True)
|
| 120 |
|
| 121 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
while True:
|
| 123 |
tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
|
| 124 |
try:
|
| 125 |
snapshot(JUPYTER_ROOT, tmp)
|
| 126 |
-
upload_folder(
|
| 127 |
-
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
print(f"DevData synced to {rid}")
|
|
|
|
|
|
|
| 130 |
except Exception as exc:
|
| 131 |
kind = classify_error(exc)
|
| 132 |
print(f"DevData sync warning [{kind}]: {exc}")
|
|
@@ -134,10 +205,12 @@ def sync_loop(api: HfApi, rid: str):
|
|
| 134 |
shutil.rmtree(tmp, ignore_errors=True)
|
| 135 |
time.sleep(INTERVAL)
|
| 136 |
|
|
|
|
| 137 |
if __name__ == "__main__":
|
| 138 |
if not enabled():
|
| 139 |
print("DevData sync disabled.")
|
| 140 |
raise SystemExit(0)
|
|
|
|
| 141 |
from huggingface_hub import HfApi, upload_folder, snapshot_download
|
| 142 |
from huggingface_hub.errors import RepositoryNotFoundError
|
| 143 |
|
|
@@ -147,6 +220,36 @@ if __name__ == "__main__":
|
|
| 147 |
api.repo_info(repo_id=rid, repo_type="dataset")
|
| 148 |
except RepositoryNotFoundError:
|
| 149 |
api.create_repo(repo_id=rid, repo_type="dataset", private=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
validate_jupyter_paths()
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
sync_loop(api, rid)
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
from __future__ import annotations
|
| 3 |
|
| 4 |
+
import os, shutil, socket, sys, tempfile, time
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
|
|
|
|
| 10 |
BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "").strip() or os.environ.get("BACKUP_DATASET", "").strip() or "huggingclaw-backup"
|
| 11 |
JUPYTER_ROOT = Path(os.environ.get("JUPYTER_ROOT_DIR", "/home/node")).resolve()
|
| 12 |
INTERVAL = int((os.environ.get("DEVDATA_SYNC_INTERVAL", "").strip() or "180"))
|
| 13 |
+
# BUG FIX #5: Respect max file size so giant files don't stall uploads.
|
| 14 |
+
# Matches the 50 MB ceiling in openclaw-sync.py; override with DEVDATA_MAX_FILE_BYTES.
|
| 15 |
+
MAX_FILE_SIZE_BYTES = int(
|
| 16 |
+
(os.environ.get("DEVDATA_MAX_FILE_BYTES", "").strip() or str(50 * 1024 * 1024))
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
def is_true(value):
    """Report whether *value* spells an affirmative flag.

    The value is converted with ``str()``, stripped of surrounding
    whitespace and lower-cased; the result counts as true only when it is
    one of "1", "true", "yes" or "on".
    """
    normalized = str(value).strip().lower()
    for accepted in ("1", "true", "yes", "on"):
        if normalized == accepted:
            return True
    return False
|
| 21 |
|
|
|
|
| 32 |
return "safety-scan"
|
| 33 |
return "general"
|
| 34 |
|
| 35 |
+
# BUG FIX #4: ".local/share/Trash" in the original EXCLUDE set was a
|
| 36 |
+
# multi-component path string that was never matched because parts-based
|
| 37 |
+
# lookup compares individual directory names. Added "Trash" as a standalone
|
| 38 |
+
# component so any path with a "Trash" segment (e.g. .local/share/Trash/*)
|
| 39 |
+
# is correctly skipped during snapshot and restore.
|
| 40 |
EXCLUDE = {
|
| 41 |
".cache",
|
| 42 |
"node_modules",
|
| 43 |
".npm",
|
| 44 |
".yarn",
|
| 45 |
+
"Trash", # BUG FIX #4: covers .local/share/Trash (was ".local/share/Trash" β never matched)
|
| 46 |
".ipynb_checkpoints",
|
| 47 |
".openclaw",
|
| 48 |
"app",
|
|
|
|
| 96 |
if p.is_dir():
|
| 97 |
target.mkdir(parents=True, exist_ok=True)
|
| 98 |
elif p.is_file():
|
| 99 |
+
# BUG FIX #5: Skip files that exceed the size limit.
|
| 100 |
+
try:
|
| 101 |
+
if p.stat().st_size > MAX_FILE_SIZE_BYTES:
|
| 102 |
+
continue
|
| 103 |
+
except OSError:
|
| 104 |
+
continue
|
| 105 |
target.parent.mkdir(parents=True, exist_ok=True)
|
| 106 |
try:
|
| 107 |
shutil.copy2(p, target)
|
| 108 |
except OSError:
|
| 109 |
pass
|
| 110 |
|
| 111 |
+
def is_jupyter_running(port: int = 8888, *, host: str = "127.0.0.1", timeout: float = 2.0) -> bool:
    """Return True if something accepts a TCP connection on *host*:*port*.

    BUG FIX #2 (safety net): restore_once() must never run while JupyterLab
    is active, because overwriting files under JUPYTER_ROOT while it is live
    corrupts its state. The primary guard is the --restore / sync separation
    introduced in BUG FIX #3; this probe is a backstop.

    The check is a plain TCP connect, so it reports any listener on the
    port, not JupyterLab specifically.

    Args:
        port: TCP port to probe (default 8888).
        host: Address to probe. Keyword-only; defaults to the loopback
            address that was previously hard-coded.
        timeout: Seconds to wait for the connection. Keyword-only;
            defaults to the previously hard-coded 2 seconds.

    Returns:
        True when the connection succeeds, False when it fails with OSError
        (refused, unreachable, timed out, ...).
    """
    try:
        # The context manager closes the probe socket as soon as we know
        # the connection was accepted.
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False
|
| 128 |
+
|
| 129 |
+
def restore_once(api, rid: str):
|
| 130 |
+
from huggingface_hub.errors import RepositoryNotFoundError
|
| 131 |
tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
|
| 132 |
try:
|
| 133 |
snapshot_download(repo_id=rid, repo_type="dataset", local_dir=str(tmp), local_dir_use_symlinks=False, token=HF_TOKEN)
|
|
|
|
| 135 |
rel = p.relative_to(tmp)
|
| 136 |
if should_skip(rel):
|
| 137 |
continue
|
| 138 |
+
if str(rel) == ".gitattributes":
|
| 139 |
+
continue
|
| 140 |
target = JUPYTER_ROOT / rel
|
| 141 |
if p.is_dir():
|
| 142 |
target.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 156 |
finally:
|
| 157 |
shutil.rmtree(tmp, ignore_errors=True)
|
| 158 |
|
| 159 |
+
def prune_remote_deleted_files(api, rid: str, snapshot_dir: Path) -> None:
    """Delete from the HF dataset any file that is absent from *snapshot_dir*.

    BUG FIX #6: without this, files the user deleted locally re-appear on
    the next Space restart because restore_once() copies everything in the
    dataset back to disk.

    Args:
        api: Object exposing ``list_repo_files`` and ``delete_files``
            (presumably a ``huggingface_hub.HfApi`` -- confirm at the caller).
        rid: Dataset repo id to prune.
        snapshot_dir: Directory holding the snapshot that was just uploaded;
            its files define what must be kept remotely.

    Best-effort: every exception is reported as a warning and swallowed so
    the caller's loop keeps running.
    """
    import glob  # local import: only this function needs it

    try:
        local_files = {
            p.relative_to(snapshot_dir).as_posix()
            for p in snapshot_dir.rglob("*")
            if p.is_file()
        }
        if not local_files:
            # An empty snapshot would mark every remote file as stale and
            # wipe the whole backup; refuse rather than risk data loss.
            print(f"DevData prune skipped: local snapshot is empty, refusing to delete every file in {rid}")
            return

        remote_files = api.list_repo_files(repo_id=rid, repo_type="dataset")
        # .gitattributes is never part of the snapshot, so it is always kept.
        stale = [f for f in remote_files if f not in local_files and f != ".gitattributes"]
        if not stale:
            return

        api.delete_files(
            # delete_patterns are wildcard patterns, not literal paths:
            # escape each name so "[", "*" and "?" in a file name cannot
            # match (and delete) other files.
            delete_patterns=[glob.escape(f) for f in stale],
            repo_id=rid,
            repo_type="dataset",
            commit_message=f"DevData prune {len(stale)} deleted file(s) {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
        )
        print(f"DevData pruned {len(stale)} deleted file(s) from {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData prune warning [{kind}]: {exc}")
|
| 184 |
+
|
| 185 |
+
def sync_loop(api, rid: str):
|
| 186 |
while True:
|
| 187 |
tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
|
| 188 |
try:
|
| 189 |
snapshot(JUPYTER_ROOT, tmp)
|
| 190 |
+
upload_folder(
|
| 191 |
+
folder_path=str(tmp),
|
| 192 |
+
repo_id=rid,
|
| 193 |
+
repo_type="dataset",
|
| 194 |
+
token=HF_TOKEN,
|
| 195 |
+
commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
|
| 196 |
+
ignore_patterns=[".git/*", ".git"],
|
| 197 |
+
)
|
| 198 |
print(f"DevData synced to {rid}")
|
| 199 |
+
# BUG FIX #6: Prune files deleted locally so they don't reappear on restore.
|
| 200 |
+
prune_remote_deleted_files(api, rid, tmp)
|
| 201 |
except Exception as exc:
|
| 202 |
kind = classify_error(exc)
|
| 203 |
print(f"DevData sync warning [{kind}]: {exc}")
|
|
|
|
| 205 |
shutil.rmtree(tmp, ignore_errors=True)
|
| 206 |
time.sleep(INTERVAL)
|
| 207 |
|
| 208 |
+
|
| 209 |
if __name__ == "__main__":
|
| 210 |
if not enabled():
|
| 211 |
print("DevData sync disabled.")
|
| 212 |
raise SystemExit(0)
|
| 213 |
+
|
| 214 |
from huggingface_hub import HfApi, upload_folder, snapshot_download
|
| 215 |
from huggingface_hub.errors import RepositoryNotFoundError
|
| 216 |
|
|
|
|
| 220 |
api.repo_info(repo_id=rid, repo_type="dataset")
|
| 221 |
except RepositoryNotFoundError:
|
| 222 |
api.create_repo(repo_id=rid, repo_type="dataset", private=True)
|
| 223 |
+
|
| 224 |
+
# ── BUG FIX #3: Restore must happen BEFORE JupyterLab starts ──────────
|
| 225 |
+
# The original code always called restore_once() here, but start.sh starts
|
| 226 |
+
# JupyterLab long before the gateway is ready and this script is launched.
|
| 227 |
+
# That made restore_once() ALWAYS run while JupyterLab was live, which
|
| 228 |
+
# overwrote its runtime/ sockets and settings — JupyterLab died.
|
| 229 |
+
#
|
| 230 |
+
# Fix: start.sh now calls `python3 jupyter-devdata-sync.py --restore`
|
| 231 |
+
# BEFORE starting JupyterLab. That --restore invocation does the restore
|
| 232 |
+
# and exits. This background invocation (no --restore flag) skips straight
|
| 233 |
+
# to sync_loop so it never touches files while JupyterLab is running.
|
| 234 |
+
#
|
| 235 |
+
# BUG FIX #2 (safety net): If JupyterLab is somehow already running when
|
| 236 |
+
# this code path is reached, abort restore to avoid corrupting its state.
|
| 237 |
+
if "--restore" in sys.argv:
|
| 238 |
+
# Synchronous restore mode — called by start.sh before JupyterLab.
|
| 239 |
+
validate_jupyter_paths()
|
| 240 |
+
restore_once(api, rid)
|
| 241 |
+
raise SystemExit(0)
|
| 242 |
+
|
| 243 |
+
# Normal background sync mode — no restore; go straight to upload loop.
|
| 244 |
validate_jupyter_paths()
|
| 245 |
+
if is_jupyter_running():
|
| 246 |
+
print("DevData: background sync started (JupyterLab is live, restore already done by --restore).")
|
| 247 |
+
else:
|
| 248 |
+
# Fallback: JupyterLab not detected. Should not normally happen
|
| 249 |
+
# because start.sh calls --restore before starting JupyterLab and then
|
| 250 |
+
# waits for the gateway before launching this background process.
|
| 251 |
+
# Log a warning and proceed to sync; do NOT restore to avoid racing
|
| 252 |
+
# with a JupyterLab that may be in the middle of starting up.
|
| 253 |
+
print("DevData: WARNING β JupyterLab not detected on port 8888. Skipping restore to be safe; starting sync loop.")
|
| 254 |
+
|
| 255 |
sync_loop(api, rid)
|