#!/usr/bin/env python3
"""Mirror the Jupyter root directory to a Hugging Face dataset.

When run as a script and enabled (see ``enabled``), this restores the
dataset contents into ``JUPYTER_ROOT`` once and then uploads a filtered
snapshot of ``JUPYTER_ROOT`` every ``INTERVAL`` seconds, forever.

Configuration is read from environment variables at import time:
``HF_TOKEN``, ``HF_USERNAME`` / ``SPACE_AUTHOR_NAME``,
``DEVDATA_DATASET_NAME``, ``BACKUP_DATASET_NAME`` / ``BACKUP_DATASET``,
``JUPYTER_ROOT_DIR``, ``DEVDATA_SYNC_INTERVAL``, ``DEVDATA`` and (checked
later, in ``enabled``) ``DEV_MODE``.
"""
from __future__ import annotations

import os
import shutil
import tempfile
import time
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Annotation-only import; the runtime imports of huggingface_hub stay
    # lazy so a disabled run never needs the package.
    from huggingface_hub import HfApi

DEFAULT_INTERVAL = 180


def _read_interval(default: int = DEFAULT_INTERVAL) -> int:
    """Return the sync interval in seconds from ``DEVDATA_SYNC_INTERVAL``.

    An unset/blank value yields *default*. A non-integer or non-positive
    value also yields *default* (with a warning) instead of crashing at
    import time or producing a zero/negative sleep in the sync loop.
    """
    raw = os.environ.get("DEVDATA_SYNC_INTERVAL", "").strip()
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        value = 0
    if value <= 0:
        print(f"DevData warning: invalid DEVDATA_SYNC_INTERVAL {raw!r}; using {default}s")
        return default
    return value


HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
HF_USERNAME = (
    os.environ.get("HF_USERNAME", "").strip()
    or os.environ.get("SPACE_AUTHOR_NAME", "").strip()
)
DATASET_NAME = os.environ.get("DEVDATA_DATASET_NAME", "").strip() or "huggingclaw-devdata"
BACKUP_DATASET_NAME = (
    os.environ.get("BACKUP_DATASET_NAME", "").strip()
    or os.environ.get("BACKUP_DATASET", "").strip()
    or "huggingclaw-backup"
)
JUPYTER_ROOT = Path(os.environ.get("JUPYTER_ROOT_DIR", "/home/node")).resolve()
INTERVAL = _read_interval()


def is_true(value):
    """Return True when *value* spells an affirmative flag (1/true/yes/on)."""
    return str(value).strip().lower() in {"1", "true", "yes", "on"}


ENABLE = is_true(os.environ.get("DEVDATA", "on"))


def classify_error(exc: Exception) -> str:
    """Map an exception to a coarse category label used in log messages.

    Returns one of ``"filesystem-permission"``, ``"network-provider"``,
    ``"safety-scan"`` or ``"general"``, decided by the exception type and
    by keywords in its lower-cased message.
    """
    msg = str(exc).lower()
    if isinstance(exc, PermissionError) or "permission denied" in msg:
        return "filesystem-permission"
    if any(
        k in msg
        for k in ("connection error", "fetch failed", "timeout", "temporarily unavailable", "network")
    ):
        return "network-provider"
    if "unsafe" in msg or "malware" in msg or "security" in msg:
        return "safety-scan"
    return "general"


# Names (or "/"-separated relative sub-paths) that are never snapshotted or
# restored. Single names match a path component at any depth.
EXCLUDE = {
    ".cache",
    "node_modules",
    ".npm",
    ".yarn",
    ".local/share/Trash",
    ".ipynb_checkpoints",
    ".openclaw",
    "app",
    "HuggingClaw",
    "HuggingClaw-Workspace",
    "browser-deps",
}

# EXCLUDE pre-split once: single-component entries are matched by set lookup,
# multi-component entries (e.g. ".local/share/Trash") as contiguous runs.
_EXCLUDE_SPLIT = [tuple(part for part in entry.split("/") if part) for entry in EXCLUDE]
_EXCLUDE_NAMES = frozenset(seq[0] for seq in _EXCLUDE_SPLIT if len(seq) == 1)
_EXCLUDE_PATHS = tuple(seq for seq in _EXCLUDE_SPLIT if len(seq) > 1)


def enabled():
    """Return True when DevData sync should run.

    Requires the ``DEVDATA`` flag (default on), a truthy ``DEV_MODE``, a
    non-empty ``HF_TOKEN`` and a devdata dataset name that differs from the
    backup dataset name. Prints an explanation when only the last condition
    fails.
    """
    dev = is_true(os.environ.get("DEV_MODE", ""))
    separate_dataset = DATASET_NAME != BACKUP_DATASET_NAME
    if ENABLE and dev and HF_TOKEN and not separate_dataset:
        print("DevData sync disabled: DEVDATA_DATASET_NAME must be separate from BACKUP_DATASET_NAME.")
    return ENABLE and dev and bool(HF_TOKEN) and separate_dataset


def validate_jupyter_paths() -> None:
    """Warn (never raise) when Jupyter's data directories are not writable."""
    # JupyterLab theme/settings live under ~/.jupyter and ~/.local/share/jupyter.
    # If these are not writable, settings can appear to "reset" every restart.
    for required in (JUPYTER_ROOT, Path("/home/node/.jupyter"), Path("/home/node/.local/share/jupyter")):
        try:
            required.mkdir(parents=True, exist_ok=True)
            # Writing and removing a probe file is the actual writability test.
            probe = required / ".devdata-write-check"
            probe.write_text("ok", encoding="utf-8")
            probe.unlink(missing_ok=True)
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData warning [{kind}]: {required} is not writable; Jupyter settings may not persist ({exc})")


def repo_id(api) -> str:
    """Return the ``namespace/dataset`` id of the devdata dataset.

    The namespace is ``HF_USERNAME`` when set, otherwise it is looked up
    through ``api.whoami()``.

    Raises:
        RuntimeError: if no namespace can be determined.
    """
    ns = HF_USERNAME
    if not ns:
        who = api.whoami()
        ns = who.get("name") or who.get("user") or ""
    if not ns:
        raise RuntimeError("Cannot resolve HF namespace for devdata sync")
    return f"{ns}/{DATASET_NAME}"


def should_skip(p: Path):
    """Return True when relative path *p* falls under an ``EXCLUDE`` entry.

    Single-name entries match any component of *p*. Multi-component entries
    such as ``.local/share/Trash`` match when their components appear
    consecutively in *p*; comparing them against individual components (as
    a plain set lookup would) can never succeed.
    """
    parts = p.parts
    if not _EXCLUDE_NAMES.isdisjoint(parts):
        return True
    for seq in _EXCLUDE_PATHS:
        width = len(seq)
        if any(parts[i:i + width] == seq for i in range(len(parts) - width + 1)):
            return True
    return False


def snapshot(src: Path, dst: Path):
    """Copy the non-excluded regular files and directories of *src* into *dst*.

    Symlinks are skipped, and excluded or symlinked directories are pruned
    from the walk so their contents are never visited. Copying is
    best-effort: a file that cannot be copied is skipped, and the number of
    skipped files is reported in one summary line.
    """
    skipped = 0
    for root, dirnames, filenames in os.walk(src):
        root_path = Path(root)
        rel_root = root_path.relative_to(src)

        kept = []
        for name in dirnames:
            rel = rel_root / name
            if should_skip(rel) or (root_path / name).is_symlink():
                continue
            (dst / rel).mkdir(parents=True, exist_ok=True)
            kept.append(name)
        # In-place assignment is how os.walk is told not to descend further.
        dirnames[:] = kept

        for name in filenames:
            rel = rel_root / name
            source = root_path / name
            if should_skip(rel) or source.is_symlink() or not source.is_file():
                continue
            target = dst / rel
            target.parent.mkdir(parents=True, exist_ok=True)
            try:
                shutil.copy2(source, target)
            except OSError:
                # Files may vanish or be unreadable mid-snapshot; keep going.
                skipped += 1
    if skipped:
        print(f"DevData snapshot skipped {skipped} unreadable file(s)")


def restore_once(api: HfApi, rid: str):
    """Download dataset *rid* and copy its files into ``JUPYTER_ROOT``.

    Excluded paths are not restored. Every failure is reported as a printed
    warning rather than raised; a missing dataset is reported as "not found
    yet". The temporary download directory is always removed.
    """
    # Imported here so the function does not depend on names injected by the
    # script entry point.
    from huggingface_hub import snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
    try:
        snapshot_download(
            repo_id=rid,
            repo_type="dataset",
            local_dir=str(tmp),
            local_dir_use_symlinks=False,
            token=HF_TOKEN,
        )
        for p in tmp.rglob("*"):
            rel = p.relative_to(tmp)
            if should_skip(rel):
                continue
            target = JUPYTER_ROOT / rel
            if p.is_dir():
                target.mkdir(parents=True, exist_ok=True)
            elif p.is_file():
                target.parent.mkdir(parents=True, exist_ok=True)
                try:
                    shutil.copy2(p, target)
                except OSError as exc:
                    kind = classify_error(exc)
                    print(f"DevData restore skip [{kind}] (cannot write {target}): {exc}")
        print(f"DevData restored from {rid}")
    except RepositoryNotFoundError:
        print(f"DevData dataset not found yet: {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData restore warning [{kind}]: {exc}")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)


def sync_loop(api: HfApi, rid: str):
    """Upload a snapshot of ``JUPYTER_ROOT`` to *rid* every ``INTERVAL`` seconds.

    Never returns. A failed cycle is printed as a warning and retried on the
    next cycle; each cycle's temporary snapshot directory is always removed.
    """
    from huggingface_hub import upload_folder

    while True:
        tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
        try:
            snapshot(JUPYTER_ROOT, tmp)
            upload_folder(
                folder_path=str(tmp),
                repo_id=rid,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
                ignore_patterns=[".git/*", ".git"],
            )
            print(f"DevData synced to {rid}")
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData sync warning [{kind}]: {exc}")
        finally:
            shutil.rmtree(tmp, ignore_errors=True)
        time.sleep(INTERVAL)


def main() -> None:
    """Script entry point: ensure the dataset exists, restore once, sync forever."""
    if not enabled():
        print("DevData sync disabled.")
        raise SystemExit(0)

    from huggingface_hub import HfApi
    from huggingface_hub.errors import RepositoryNotFoundError

    api = HfApi(token=HF_TOKEN)
    rid = repo_id(api)
    try:
        api.repo_info(repo_id=rid, repo_type="dataset")
    except RepositoryNotFoundError:
        api.create_repo(repo_id=rid, repo_type="dataset", private=True)
    validate_jupyter_paths()
    restore_once(api, rid)
    sync_loop(api, rid)


if __name__ == "__main__":
    main()