Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import os, shutil, tempfile, time | |
| from pathlib import Path | |
# Hugging Face access token; enabled() treats an empty value as "sync off".
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
# Namespace owning the dataset: HF_USERNAME first, then SPACE_AUTHOR_NAME.
# Left empty when neither is set, in which case repo_id() asks the API.
HF_USERNAME = os.environ.get("HF_USERNAME", "").strip() or os.environ.get("SPACE_AUTHOR_NAME", "").strip()
# Dataset that receives the synced files.
DATASET_NAME = os.environ.get("DEVDATA_DATASET_NAME", "").strip() or "huggingclaw-devdata"
# Name of the backup dataset (BACKUP_DATASET_NAME, then BACKUP_DATASET);
# enabled() refuses to run when it equals DATASET_NAME.
BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "").strip() or os.environ.get("BACKUP_DATASET", "").strip() or "huggingclaw-backup"
# Root of the directory tree that is snapshotted and restored.
JUPYTER_ROOT = Path(os.environ.get("JUPYTER_ROOT_DIR", "/home/node")).resolve()

# Seconds to sleep between sync attempts.  A malformed or non-positive value
# falls back to the default instead of crashing at import time (int() raising)
# or later inside the loop (time.sleep() rejects negative values).
_DEFAULT_INTERVAL = 180
_raw_interval = os.environ.get("DEVDATA_SYNC_INTERVAL", "").strip()
try:
    INTERVAL = int(_raw_interval) if _raw_interval else _DEFAULT_INTERVAL
except ValueError:
    INTERVAL = 0  # sentinel: rejected by the range check below
if INTERVAL <= 0:
    print(f"DevData warning [general]: invalid DEVDATA_SYNC_INTERVAL {_raw_interval!r}; using {_DEFAULT_INTERVAL}s")
    INTERVAL = _DEFAULT_INTERVAL
def is_true(value):
    """Tell whether *value* spells an affirmative flag.

    The value is stringified, trimmed and lower-cased, then compared against
    "1", "true", "yes" and "on"; anything else yields False.
    """
    affirmative = ("1", "true", "yes", "on")
    normalized = str(value).strip().lower()
    return normalized in affirmative


# Feature switch read from DEVDATA; an unset variable counts as "on".
ENABLE = is_true(os.environ.get("DEVDATA", "on"))
def classify_error(exc: Exception) -> str:
    """Map an exception to a coarse label used in log messages.

    Checks run in this order and the first match wins:

    * "filesystem-permission" - a PermissionError, or "permission denied" in
      the message;
    * "network-provider" - a built-in ConnectionError/TimeoutError (or a
      subclass), or a network-related phrase in the message;
    * "safety-scan" - "unsafe", "malware" or "security" in the message;
    * "general" - everything else.

    Args:
        exc: The exception to classify.

    Returns:
        One of the four labels above.
    """
    msg = str(exc).lower()
    if isinstance(exc, PermissionError) or "permission denied" in msg:
        return "filesystem-permission"
    # "timed out" covers messages such as "Read timed out", which do not
    # contain the single word "timeout".
    network_markers = (
        "connection error",
        "fetch failed",
        "timeout",
        "timed out",
        "temporarily unavailable",
        "network",
    )
    # Type check mirrors the PermissionError handling above: socket-level
    # errors often carry messages with none of the markers in them.
    if isinstance(exc, (ConnectionError, TimeoutError)) or any(k in msg for k in network_markers):
        return "network-provider"
    if any(k in msg for k in ("unsafe", "malware", "security")):
        return "safety-scan"
    return "general"
# Entries that should_skip() compares against path components, so they are
# left out of both snapshot and restore.
# NOTE(review): ".local/share/Trash" spans several components; whether it can
# match depends on how should_skip() compares entries - confirm.
_DOT_PREFIXED_EXCLUDES = (
    ".cache",
    ".npm",
    ".yarn",
    ".local/share/Trash",
    ".ipynb_checkpoints",
    ".openclaw",
)
_PLAIN_EXCLUDES = (
    "node_modules",
    "app",
    "HuggingClaw",
    "HuggingClaw-Workspace",
    "browser-deps",
)
EXCLUDE = set(_DOT_PREFIXED_EXCLUDES) | set(_PLAIN_EXCLUDES)
def enabled():
    """Decide whether DevData sync should run.

    Sync runs only when ENABLE is set, DEV_MODE is truthy, a token is
    configured, and the devdata dataset name differs from the backup dataset
    name.  When only the last condition fails, an explanatory line is printed.

    Returns:
        True when every condition holds, otherwise False.
    """
    dev_mode = is_true(os.environ.get("DEV_MODE", ""))
    if not (ENABLE and dev_mode and bool(HF_TOKEN)):
        return False
    if DATASET_NAME == BACKUP_DATASET_NAME:
        print("DevData sync disabled: DEVDATA_DATASET_NAME must be separate from BACKUP_DATASET_NAME.")
        return False
    return True
def validate_jupyter_paths() -> None:
    """Check that the Jupyter-related directories can be written to.

    Each directory is created if missing, then a small marker file is written
    and removed again.  Any failure is reported as a warning on stdout; nothing
    is raised.
    """
    # JupyterLab keeps theme/settings under ~/.jupyter and ~/.local/share/jupyter;
    # when those cannot be written, settings look like they "reset" on restart.
    candidates = [
        JUPYTER_ROOT,
        Path("/home/node/.jupyter"),
        Path("/home/node/.local/share/jupyter"),
    ]
    for directory in candidates:
        marker = directory / ".devdata-write-check"
        try:
            directory.mkdir(parents=True, exist_ok=True)
            marker.write_text("ok", encoding="utf-8")
            marker.unlink(missing_ok=True)
        except Exception as exc:
            label = classify_error(exc)
            print(f"DevData warning [{label}]: {directory} is not writable; Jupyter settings may not persist ({exc})")
def repo_id(api) -> str:
    """Build the "<namespace>/<dataset>" identifier of the devdata dataset.

    Args:
        api: Object exposing ``whoami()``; consulted only when HF_USERNAME is
            empty.

    Returns:
        The namespace joined to DATASET_NAME with a slash.

    Raises:
        RuntimeError: If neither HF_USERNAME nor the ``whoami()`` result
            ("name", then "user") yields a namespace.
    """
    namespace = HF_USERNAME
    if namespace:
        return f"{namespace}/{DATASET_NAME}"
    account = api.whoami()
    namespace = account.get("name") or account.get("user") or ""
    if namespace:
        return f"{namespace}/{DATASET_NAME}"
    raise RuntimeError("Cannot resolve HF namespace for devdata sync")
def should_skip(p: Path, exclude=None):
    """Tell whether the relative path *p* falls under an excluded entry.

    A single-component entry (e.g. "node_modules") matches when any component
    of *p* equals it.  An entry containing "/" (e.g. ".local/share/Trash")
    matches when its components appear as a contiguous run inside *p*; a plain
    membership test can never match such an entry, because no single path
    component contains a slash.

    Args:
        p: Path relative to the tree being copied.
        exclude: Optional iterable of entries to test against; defaults to the
            module-level EXCLUDE.

    Returns:
        True if *p* should be left out, otherwise False.
    """
    rules = EXCLUDE if exclude is None else exclude
    parts = p.parts
    names = set(parts)
    for rule in rules:
        if "/" not in rule:
            if rule in names:
                return True
            continue
        wanted = tuple(seg for seg in rule.split("/") if seg)
        width = len(wanted)
        # An entry made only of slashes has no components and matches nothing.
        if width and any(parts[i:i + width] == wanted for i in range(len(parts) - width + 1)):
            return True
    return False
def snapshot(src: Path, dst: Path):
    """Copy the tree under *src* into *dst*, leaving out excluded paths.

    Entries for which should_skip() is true, and all symlinks, are left out.
    Excluded directories are pruned from the walk, so their contents are never
    enumerated.  Only directories and regular files are reproduced.  Files that
    cannot be copied are skipped, and a single summary line is printed at the
    end instead of dropping the errors silently.

    Args:
        src: Root of the tree to copy.
        dst: Destination directory that receives the copy.
    """
    failed = 0
    first_error = None
    for root, dirnames, filenames in os.walk(src):
        here = Path(root)
        rel_root = here.relative_to(src)

        kept = []
        for name in dirnames:
            rel = rel_root / name
            if should_skip(rel) or (here / name).is_symlink():
                continue
            kept.append(name)
            (dst / rel).mkdir(parents=True, exist_ok=True)
        # Assigning in place tells os.walk not to descend into dropped entries.
        dirnames[:] = kept

        for name in filenames:
            rel = rel_root / name
            source = here / name
            if should_skip(rel) or source.is_symlink() or not source.is_file():
                continue
            target = dst / rel
            target.parent.mkdir(parents=True, exist_ok=True)
            try:
                shutil.copy2(source, target)
            except OSError as exc:
                # Best effort: one unreadable file must not abort the snapshot.
                failed += 1
                if first_error is None:
                    first_error = exc
    if failed:
        kind = classify_error(first_error)
        print(f"DevData snapshot warning [{kind}]: skipped {failed} file(s) that could not be copied; first error: {first_error}")
def restore_once(api: HfApi, rid: str):
    """Download dataset *rid* and copy its contents into JUPYTER_ROOT.

    The dataset is downloaded into a temporary directory, which is always
    removed afterwards.  Paths for which should_skip() is true are left out.
    A directory or file that cannot be written is reported and skipped without
    aborting the rest of the restore.  A missing dataset and any other failure
    are reported on stdout; nothing is raised for them.

    Args:
        api: Hub client; accepted for interface compatibility, not used here.
        rid: Dataset identifier in "<namespace>/<name>" form.
    """
    # Imported locally so the function does not depend on the script's
    # __main__ guard having bound these names as module globals.
    from huggingface_hub import snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
    try:
        # `local_dir_use_symlinks` is intentionally not passed: it is
        # deprecated, and a release that rejects it would turn every restore
        # into a swallowed TypeError.  Symlinked files, if an older release
        # produces them, are still copied by content below.
        snapshot_download(repo_id=rid, repo_type="dataset", local_dir=str(tmp), token=HF_TOKEN)
        skipped = 0
        for p in tmp.rglob("*"):
            rel = p.relative_to(tmp)
            if should_skip(rel):
                continue
            target = JUPYTER_ROOT / rel
            # Directory creation sits inside the per-entry guard as well, so a
            # single unwritable directory does not abort the whole restore.
            try:
                if p.is_dir():
                    target.mkdir(parents=True, exist_ok=True)
                elif p.is_file():
                    target.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(p, target)
            except OSError as exc:
                skipped += 1
                kind = classify_error(exc)
                print(f"DevData restore skip [{kind}] (cannot write {target}): {exc}")
        if skipped:
            print(f"DevData restored from {rid} ({skipped} entries skipped)")
        else:
            print(f"DevData restored from {rid}")
    except RepositoryNotFoundError:
        print(f"DevData dataset not found yet: {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData restore warning [{kind}]: {exc}")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)
def sync_loop(api: HfApi, rid: str):
    """Upload a snapshot of JUPYTER_ROOT to dataset *rid*, forever.

    Each round copies JUPYTER_ROOT into a fresh temporary directory, uploads
    that directory as one commit, removes it, and then sleeps INTERVAL seconds.
    Failures in a round are printed as warnings and the loop continues.  This
    function does not return.

    Args:
        api: Hub client used for the upload.
        rid: Dataset identifier in "<namespace>/<name>" form.
    """
    while True:
        tmp = None
        try:
            # Created inside the guard so that a failure to allocate the
            # staging directory is reported like any other sync error rather
            # than terminating the loop.
            tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
            snapshot(JUPYTER_ROOT, tmp)
            # Uses the client passed in instead of a module-level function
            # that only exists when the file runs as a script.
            api.upload_folder(folder_path=str(tmp), repo_id=rid, repo_type="dataset", token=HF_TOKEN,
                              commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
                              ignore_patterns=[".git/*", ".git"])
            print(f"DevData synced to {rid}")
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData sync warning [{kind}]: {exc}")
        finally:
            if tmp is not None:
                shutil.rmtree(tmp, ignore_errors=True)
        time.sleep(INTERVAL)
# Script entry point: check configuration, make sure the dataset exists,
# restore once, then sync forever.
if __name__ == "__main__":
    # Exit with status 0 when sync is switched off or misconfigured.
    if not enabled():
        print("DevData sync disabled.")
        raise SystemExit(0)
    # Third-party imports are deferred until sync is known to be enabled.
    # They execute at module scope, so the names are bound as module globals
    # where the functions defined above can see them.
    from huggingface_hub import HfApi, upload_folder, snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError
    api = HfApi(token=HF_TOKEN)
    rid = repo_id(api)
    # Create the dataset as a private repo when the lookup reports it missing.
    try:
        api.repo_info(repo_id=rid, repo_type="dataset")
    except RepositoryNotFoundError:
        api.create_repo(repo_id=rid, repo_type="dataset", private=True)
    validate_jupyter_paths()
    restore_once(api, rid)
    # sync_loop() loops without a break, so nothing after this call runs.
    sync_loop(api, rid)