#!/usr/bin/env python3
from __future__ import annotations
import os, shutil, socket, sys, tempfile, time
from pathlib import Path
def _env(name: str) -> str:
    """Return environment variable *name* with surrounding whitespace removed ("" when unset)."""
    return os.environ.get(name, "").strip()


# Hugging Face credentials / namespace.  HF_USERNAME falls back to SPACE_AUTHOR_NAME.
HF_TOKEN = _env("HF_TOKEN")
HF_USERNAME = _env("HF_USERNAME") or _env("SPACE_AUTHOR_NAME")

# Dataset names: the devdata target and the backup dataset it is compared against.
DATASET_NAME = _env("DEVDATA_DATASET_NAME") or "huggingclaw-devdata"
BACKUP_DATASET_NAME = _env("BACKUP_DATASET_NAME") or _env("BACKUP_DATASET") or "huggingclaw-backup"

# Directory tree that is snapshotted and restored.
JUPYTER_ROOT = Path(os.environ.get("JUPYTER_ROOT_DIR", "/home/node")).resolve()

# Seconds to sleep between two sync cycles.
INTERVAL = int(_env("DEVDATA_SYNC_INTERVAL") or "180")

# Files larger than this many bytes are left out of snapshots so that giant
# files do not stall uploads.  Defaults to 50 MB (the original author notes this
# matches openclaw-sync.py); override with DEVDATA_MAX_FILE_BYTES.
MAX_FILE_SIZE_BYTES = int(_env("DEVDATA_MAX_FILE_BYTES") or str(50 * 1024 * 1024))
def is_true(value):
    """Return True when *value*, rendered as text, is one of: 1, true, yes, on.

    The comparison ignores case and surrounding whitespace.
    """
    normalized = str(value).strip().lower()
    return normalized in ("1", "true", "yes", "on")


# Master switch for the whole feature; on unless DEVDATA is set to a non-truthy value.
ENABLE = is_true(os.environ.get("DEVDATA", "on"))
def classify_error(exc: Exception) -> str:
    """Map *exc* to a coarse category label used in log messages.

    Returns one of:
      - "filesystem-permission": a PermissionError, or the message mentions
        "permission denied";
      - "network-provider": a TimeoutError / ConnectionError, or the message
        contains a known network keyword;
      - "safety-scan": the message mentions "unsafe", "malware" or "security";
      - "general": anything else.

    Categories are checked in that order; the first match wins.
    """
    msg = str(exc).lower()

    if isinstance(exc, PermissionError) or "permission denied" in msg:
        return "filesystem-permission"

    # Check the exception type as well as the text: stdlib timeouts render as
    # "timed out" (which does not contain "timeout"), and connection resets /
    # refusals often carry no recognisable keyword at all.
    network_keywords = (
        "connection error",
        "fetch failed",
        "timeout",
        "timed out",
        "temporarily unavailable",
        "network",
    )
    if isinstance(exc, (TimeoutError, ConnectionError)) or any(k in msg for k in network_keywords):
        return "network-provider"

    if any(k in msg for k in ("unsafe", "malware", "security")):
        return "safety-scan"

    return "general"
# Names that are never snapshotted or restored.  should_skip() tests each entry
# against the individual components of a relative path, so every entry must be
# a single path segment; a multi-segment string such as ".local/share/Trash"
# would never match anything.
EXCLUDE = {
    # Caches, package-manager state and other tool-managed directories.
    ".cache",
    ".npm",
    ".yarn",
    "node_modules",
    "__pycache__",
    "browser-deps",
    ".ipynb_checkpoints",
    # Any path containing a "Trash" segment (e.g. .local/share/Trash/*).
    "Trash",
    # Application directories.
    ".openclaw",
    "app",
    "HuggingClaw",
    "HuggingClaw-Workspace",
    # Python/system package trees (e.g. .local/lib/python3.11/site-packages/).
    # Per the original author's note these hold thousands of files, cause
    # 10,000+ file fetches on every restore, and can bring back a broken
    # jsonschema that crashes JupyterLab on boot.
    ".local",
    "lib",
    "site-packages",
}
def enabled():
    """Return True when DevData sync should run.

    All of the following must hold: the DEVDATA switch (ENABLE) is on, DEV_MODE
    is truthy, an HF token is configured, and the devdata dataset name differs
    from the backup dataset name.  When only the last condition fails, an
    explanatory message is printed.
    """
    dev_mode = is_true(os.environ.get("DEV_MODE", ""))
    if not (ENABLE and dev_mode and HF_TOKEN):
        return False

    if DATASET_NAME == BACKUP_DATASET_NAME:
        print("DevData sync disabled: DEVDATA_DATASET_NAME must be separate from BACKUP_DATASET_NAME.")
        return False

    return True
def validate_jupyter_paths() -> None:
    """Check that the Jupyter data directories exist and are writable.

    For JUPYTER_ROOT, /home/node/.jupyter and /home/node/.local/share/jupyter
    this creates the directory if needed, then writes and removes a small probe
    file.  Any failure is reported as a warning and never raised: if these
    locations are not writable, Jupyter settings can appear to "reset" on every
    restart.
    """
    candidates = (
        JUPYTER_ROOT,
        Path("/home/node/.jupyter"),
        Path("/home/node/.local/share/jupyter"),
    )
    for directory in candidates:
        try:
            directory.mkdir(parents=True, exist_ok=True)
            marker = directory / ".devdata-write-check"
            marker.write_text("ok", encoding="utf-8")
            marker.unlink(missing_ok=True)
        except Exception as exc:
            label = classify_error(exc)
            print(f"DevData warning [{label}]: {directory} is not writable; Jupyter settings may not persist ({exc})")
def repo_id(api) -> str:
    """Build the "<namespace>/<dataset>" id of the devdata dataset.

    The namespace is HF_USERNAME when set; otherwise it is taken from the
    "name" (or, failing that, "user") field returned by ``api.whoami()``.

    Raises:
        RuntimeError: if no namespace can be determined.
    """
    namespace = HF_USERNAME
    if not namespace:
        profile = api.whoami()
        namespace = profile.get("name") or profile.get("user") or ""
        if not namespace:
            raise RuntimeError("Cannot resolve HF namespace for devdata sync")
    return f"{namespace}/{DATASET_NAME}"
# Glob patterns for names that must never be synced to an HF Dataset, public or
# private.  Each pattern is matched against the *name* of a single path
# component (not the full path), so ".env" matches both /home/node/.env and
# /home/node/subdir/.env.
import fnmatch as _fnmatch

SECRET_FILENAME_PATTERNS = {
    # dotenv files (almost always contain API keys): .env, .env.local, ...
    ".env",
    ".env.*",
    # Anything whose name contains "secret".  The narrower variants are already
    # covered by "*secret*" and are kept only for explicitness.
    "*secret*",
    "*secrets*",
    "*_secret*",
    "*-secret*",
    # Private keys and API key files.
    "*key*",
    "*_key*",
    "*-key*",
    # Token files.
    "*token*",
    "*_token*",
    "*-token*",
    # Key / certificate containers: TLS/SSH private keys, generic key files,
    # PKCS#12 bundles.
    "*.pem",
    "*.key",
    "*.p12",
    "*.pfx",
    # Well-known credential stores (.netrc holds plaintext passwords).
    "credentials",
    "credentials.*",
    ".netrc",
    ".htpasswd",
}
def _name_is_secret(name: str) -> bool:
    """Tell whether *name* (one path component) matches a secret-exclusion pattern.

    The name is lower-cased before being compared against each entry of
    SECRET_FILENAME_PATTERNS.
    """
    lowered = name.lower()
    for pattern in SECRET_FILENAME_PATTERNS:
        if _fnmatch.fnmatch(lowered, pattern):
            return True
    return False
def should_skip(p: Path):
    """Return True if the relative path *p* must be left out of snapshot/restore.

    A path is skipped when any of its components is listed in EXCLUDE, or when
    any component's name looks like a secret according to _name_is_secret().
    """
    components = p.parts

    # Hard-coded exclusions first: one shared component is enough.
    if not EXCLUDE.isdisjoint(components):
        return True

    # Then the secret-name patterns, component by component.
    for component in components:
        if _name_is_secret(component):
            return True
    return False
def snapshot(src: Path, dst: Path):
    """Copy the syncable part of the tree under *src* into *dst*.

    Skipped entries:
      - anything for which should_skip() is true (excluded or secret-looking
        path components);
      - symlinks, both files and directories;
      - anything that is not a regular file or directory;
      - files larger than MAX_FILE_SIZE_BYTES, or whose size cannot be read.

    Directories that survive filtering are recreated in *dst* even when empty.
    Copying is best effort: a file that cannot be read or copied is left out of
    the snapshot without aborting the run.

    The walk prunes skipped directories instead of descending into them, so
    large excluded trees (node_modules, .cache, site-packages, ...) are never
    enumerated, and nothing reachable only through a symlinked directory is
    copied.
    """
    for root, dirnames, filenames in os.walk(src):
        root_path = Path(root)
        rel_root = root_path.relative_to(src)

        # Prune in place so os.walk() does not descend into skipped directories.
        kept_dirs = []
        for name in dirnames:
            rel = rel_root / name
            if should_skip(rel) or (root_path / name).is_symlink():
                continue
            (dst / rel).mkdir(parents=True, exist_ok=True)
            kept_dirs.append(name)
        dirnames[:] = kept_dirs

        for name in filenames:
            rel = rel_root / name
            source = root_path / name
            if should_skip(rel) or source.is_symlink():
                continue
            try:
                # Regular files only (no sockets/FIFOs), within the size limit.
                if not source.is_file() or source.stat().st_size > MAX_FILE_SIZE_BYTES:
                    continue
            except OSError:
                continue
            target = dst / rel
            target.parent.mkdir(parents=True, exist_ok=True)
            try:
                shutil.copy2(source, target)
            except OSError:
                # Deliberate best effort: the file may have vanished or be
                # unreadable; this function runs on every sync cycle, so stay quiet.
                pass
def is_jupyter_running(port: int = 8888) -> bool:
    """Report whether something accepts TCP connections on 127.0.0.1:*port*.

    This is used as the "is JupyterLab already up?" probe.  Restoring files
    under JUPYTER_ROOT (runtime sockets, lab settings, kernel connection files)
    while JupyterLab is live corrupts its state, so callers use this check to
    decide whether touching those files is safe.

    The connection attempt uses a 2-second timeout; any OSError (refused,
    timed out, unreachable) is reported as "not running".
    """
    try:
        probe = socket.create_connection(("127.0.0.1", port), timeout=2)
    except OSError:
        return False
    probe.close()
    return True
def restore_once(api, rid: str):
    """Download dataset *rid* and copy its files into JUPYTER_ROOT.

    Args:
        api: Hugging Face API client (unused here; kept for a uniform signature).
        rid: "<namespace>/<dataset>" id of the devdata dataset.

    Behaviour:
      - Does nothing if is_jupyter_running() reports a live listener, because
        overwriting files under JUPYTER_ROOT while JupyterLab runs corrupts its
        state.
      - Paths rejected by should_skip() and the top-level ``.gitattributes`` are
        not restored.
      - A file that cannot be written is reported and skipped.
      - A missing dataset or any other failure is reported as a message; nothing
        is raised to the caller.
      - The temporary download directory is always removed.
    """
    # Imported here so the function also works when this module is imported
    # rather than executed as a script.
    from huggingface_hub import snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    # Hard backstop: never restore on top of a running JupyterLab.
    if is_jupyter_running():
        print("DevData restore skipped: JupyterLab is already running; refusing to overwrite live files.")
        return

    tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
    try:
        snapshot_download(repo_id=rid, repo_type="dataset", local_dir=str(tmp), local_dir_use_symlinks=False, token=HF_TOKEN)
        for p in tmp.rglob("*"):
            rel = p.relative_to(tmp)
            if should_skip(rel):
                continue
            if str(rel) == ".gitattributes":
                continue
            target = JUPYTER_ROOT / rel
            if p.is_dir():
                target.mkdir(parents=True, exist_ok=True)
            elif p.is_file():
                target.parent.mkdir(parents=True, exist_ok=True)
                try:
                    shutil.copy2(p, target)
                except OSError as exc:
                    kind = classify_error(exc)
                    print(f"DevData restore skip [{kind}] (cannot write {target}): {exc}")
        print(f"DevData restored from {rid}")
    except RepositoryNotFoundError:
        print(f"DevData dataset not found yet: {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData restore warning [{kind}]: {exc}")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)
def prune_remote_deleted_files(api, rid: str, snapshot_dir: Path) -> None:
    """Delete from dataset *rid* the files that are absent from *snapshot_dir*.

    Without pruning, files the user deleted locally would come back on the next
    restore, because restore copies everything in the dataset back to disk.

    Args:
        api: client exposing ``list_repo_files`` and ``delete_files``.
        rid: "<namespace>/<dataset>" id of the devdata dataset.
        snapshot_dir: root of the local snapshot that was just uploaded.

    Safety rules:
      - If the snapshot contains no files at all, nothing is pruned.  An empty
        snapshot more likely means the source tree was unavailable than that
        the user deleted everything, and pruning would wipe the whole dataset.
      - ``.gitattributes`` is never pruned.
      - Stale paths are glob-escaped before deletion: ``delete_patterns`` are
        shell-style wildcards, so a literal name containing ``*``, ``?`` or
        ``[`` could otherwise match and delete unrelated files.

    Any failure is reported as a warning; nothing is raised to the caller.
    """
    import glob

    try:
        local_files = {
            p.relative_to(snapshot_dir).as_posix()
            for p in snapshot_dir.rglob("*")
            if p.is_file()
        }
        if not local_files:
            print(f"DevData prune skipped: local snapshot is empty; refusing to delete every file in {rid}")
            return

        stale = [
            f
            for f in api.list_repo_files(repo_id=rid, repo_type="dataset")
            if f not in local_files and f != ".gitattributes"
        ]
        if stale:
            api.delete_files(
                delete_patterns=[glob.escape(f) for f in stale],
                repo_id=rid,
                repo_type="dataset",
                commit_message=f"DevData prune {len(stale)} deleted file(s) {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
            )
            print(f"DevData pruned {len(stale)} deleted file(s) from {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData prune warning [{kind}]: {exc}")
def sync_loop(api, rid: str):
    """Upload a snapshot of JUPYTER_ROOT to dataset *rid* forever, every INTERVAL seconds.

    Each cycle:
      1. snapshot() copies the syncable files into a fresh temporary directory;
      2. the directory is uploaded as one commit;
      3. prune_remote_deleted_files() removes remote files that no longer exist
         locally, so they do not reappear on restore;
      4. the temporary directory is removed, whatever happened.

    A failure in any step is reported as a warning and the loop carries on with
    the next cycle.  This function never returns.

    Args:
        api: Hugging Face API client, passed on to the prune step.
        rid: "<namespace>/<dataset>" id of the devdata dataset.
    """
    # Imported here so the function also works when this module is imported
    # rather than executed as a script.
    from huggingface_hub import upload_folder

    while True:
        tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
        try:
            snapshot(JUPYTER_ROOT, tmp)
            upload_folder(
                folder_path=str(tmp),
                repo_id=rid,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
                ignore_patterns=[".git/*", ".git"],
            )
            print(f"DevData synced to {rid}")
            # Prune only after a successful upload, using the same snapshot.
            prune_remote_deleted_files(api, rid, tmp)
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData sync warning [{kind}]: {exc}")
        finally:
            shutil.rmtree(tmp, ignore_errors=True)
        time.sleep(INTERVAL)
if __name__ == "__main__":
    if not enabled():
        print("DevData sync disabled.")
        sys.exit(0)

    # Kept at module scope (not wrapped in a main() function) so that these
    # names are module-level globals for the helper functions defined above.
    from huggingface_hub import HfApi, upload_folder, snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    api = HfApi(token=HF_TOKEN)
    rid = repo_id(api)

    # Create the private devdata dataset on first use.
    try:
        api.repo_info(repo_id=rid, repo_type="dataset")
    except RepositoryNotFoundError:
        api.create_repo(repo_id=rid, repo_type="dataset", private=True)

    # Both modes begin by checking that the Jupyter directories are writable.
    validate_jupyter_paths()

    # Two modes of operation:
    #
    #   --restore   Synchronous restore, then exit.  start.sh is expected to run
    #               this BEFORE JupyterLab starts, because restoring on top of a
    #               live JupyterLab overwrites its runtime sockets and settings
    #               and makes it exit.
    #
    #   (no flag)   Background mode.  Never restores; goes straight to the
    #               upload loop so files are not touched while JupyterLab runs.
    if "--restore" in sys.argv:
        restore_once(api, rid)
        sys.exit(0)

    if not is_jupyter_running():
        # Not expected: start.sh restores first, starts JupyterLab, and only
        # then launches this background process.  Still do NOT restore here,
        # since JupyterLab may be in the middle of starting up.
        print("DevData: WARNING β JupyterLab not detected on port 8888. Skipping restore to be safe; starting sync loop.")
    else:
        print("DevData: background sync started (JupyterLab is live, restore already done by --restore).")

    sync_loop(api, rid)
|