File size: 12,607 Bytes
0e6fb59
 
 
f54eaff
0e6fb59
 
 
 
 
 
 
 
f54eaff
 
 
 
 
 
0e6fb59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f54eaff
 
 
 
 
0e6fb59
 
 
 
 
f54eaff
0e6fb59
 
 
 
 
 
7baec40
 
 
 
 
 
 
 
0e6fb59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09e99f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e6fb59
09e99f8
 
 
 
 
 
0e6fb59
 
 
 
 
 
 
 
 
 
 
 
f54eaff
 
 
 
 
 
0e6fb59
 
 
 
 
 
f54eaff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e6fb59
 
 
 
 
 
 
f54eaff
 
0e6fb59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f54eaff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e6fb59
 
 
 
f54eaff
 
 
 
 
 
 
 
0e6fb59
f54eaff
 
0e6fb59
 
 
 
 
 
 
f54eaff
0e6fb59
 
 
 
f54eaff
0e6fb59
 
 
 
 
 
 
 
 
f54eaff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e6fb59
f54eaff
 
 
 
 
 
 
 
 
 
0e6fb59
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
#!/usr/bin/env python3
from __future__ import annotations

import os, shutil, socket, sys, tempfile, time
from pathlib import Path

HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
HF_USERNAME = os.environ.get("HF_USERNAME", "").strip() or os.environ.get("SPACE_AUTHOR_NAME", "").strip()
DATASET_NAME = os.environ.get("DEVDATA_DATASET_NAME", "").strip() or "huggingclaw-devdata"
BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "").strip() or os.environ.get("BACKUP_DATASET", "").strip() or "huggingclaw-backup"
JUPYTER_ROOT = Path(os.environ.get("JUPYTER_ROOT_DIR", "/home/node")).resolve()
INTERVAL = int((os.environ.get("DEVDATA_SYNC_INTERVAL", "").strip() or "180"))
# BUG FIX #5: Respect max file size so giant files don't stall uploads.
# Matches the 50 MB ceiling in openclaw-sync.py; override with DEVDATA_MAX_FILE_BYTES.
MAX_FILE_SIZE_BYTES = int(
    (os.environ.get("DEVDATA_MAX_FILE_BYTES", "").strip() or str(50 * 1024 * 1024))
)

def is_true(value):
    return str(value).strip().lower() in {"1", "true", "yes", "on"}

ENABLE = is_true(os.environ.get("DEVDATA", "on"))


def classify_error(exc: Exception) -> str:
    msg = str(exc).lower()
    if isinstance(exc, PermissionError) or "permission denied" in msg:
        return "filesystem-permission"
    if any(k in msg for k in ("connection error", "fetch failed", "timeout", "temporarily unavailable", "network")):
        return "network-provider"
    if "unsafe" in msg or "malware" in msg or "security" in msg:
        return "safety-scan"
    return "general"

# BUG FIX #4: ".local/share/Trash" in the original EXCLUDE set was a
# multi-component path string that was never matched because parts-based
# lookup compares individual directory names.  Added "Trash" as a standalone
# component so any path with a "Trash" segment (e.g. .local/share/Trash/*)
# is correctly skipped during snapshot and restore.
EXCLUDE = {
    ".cache",
    "node_modules",
    ".npm",
    ".yarn",
    "Trash",            # BUG FIX #4: covers .local/share/Trash (was ".local/share/Trash" β€” never matched)
    ".ipynb_checkpoints",
    ".openclaw",
    "app",
    "HuggingClaw",
    "HuggingClaw-Workspace",
    "browser-deps",
    # Exclude Python/system package directories β€” these contain thousands of files
    # (e.g. .local/lib/python3.11/site-packages/) and must not be synced to the
    # HF Dataset. Syncing them causes 10,000+ file fetches on every restore and
    # can restore a broken jsonschema that crashes JupyterLab on boot.
    ".local",
    "lib",
    "site-packages",
    "__pycache__",
}


def enabled():
    dev = is_true(os.environ.get("DEV_MODE", ""))
    separate_dataset = DATASET_NAME != BACKUP_DATASET_NAME
    if ENABLE and dev and HF_TOKEN and not separate_dataset:
        print("DevData sync disabled: DEVDATA_DATASET_NAME must be separate from BACKUP_DATASET_NAME.")
    return ENABLE and dev and bool(HF_TOKEN) and separate_dataset

def validate_jupyter_paths() -> None:
    # JupyterLab theme/settings live under ~/.jupyter and ~/.local/share/jupyter.
    # If these are not writable, settings can appear to "reset" every restart.
    for required in (JUPYTER_ROOT, Path("/home/node/.jupyter"), Path("/home/node/.local/share/jupyter")):
        try:
            required.mkdir(parents=True, exist_ok=True)
            probe = required / ".devdata-write-check"
            probe.write_text("ok", encoding="utf-8")
            probe.unlink(missing_ok=True)
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData warning [{kind}]: {required} is not writable; Jupyter settings may not persist ({exc})")

def repo_id(api) -> str:
    ns = HF_USERNAME
    if not ns:
        who = api.whoami()
        ns = who.get("name") or who.get("user") or ""
    if not ns:
        raise RuntimeError("Cannot resolve HF namespace for devdata sync")
    return f"{ns}/{DATASET_NAME}"

# Filename patterns that must never be synced to a public/private HF Dataset.
# These are matched against the *name* of each path component (not the full path),
# so ".env" matches /home/node/.env and /home/node/subdir/.env alike.
import fnmatch as _fnmatch

SECRET_FILENAME_PATTERNS = {
    ".env",           # dotenv files β€” almost always contain API keys
    ".env.*",         # .env.local, .env.production, etc.
    "*secret*",       # any file/dir whose name contains "secret"
    "*secrets*",
    "*_secret*",
    "*-secret*",
    "*key*",          # private keys, API key files
    "*_key*",
    "*-key*",
    "*token*",        # token files
    "*_token*",
    "*-token*",
    "*.pem",          # TLS/SSH private keys
    "*.key",          # generic key files
    "*.p12",          # PKCS#12 bundles
    "*.pfx",
    "credentials",    # common credential file names
    "credentials.*",
    ".netrc",         # stores plaintext passwords
    ".htpasswd",
}


def _name_is_secret(name: str) -> bool:
    """Return True if *name* matches any secret-exclusion pattern."""
    name_lower = name.lower()
    return any(_fnmatch.fnmatch(name_lower, pat) for pat in SECRET_FILENAME_PATTERNS)


def should_skip(p: Path):
    # Skip directories/files in the hard-coded exclude set.
    parts = p.parts
    if any(x in parts for x in EXCLUDE):
        return True
    # Skip any component whose name looks like a secret file/dir.
    return any(_name_is_secret(part) for part in parts)

def snapshot(src: Path, dst: Path):
    for p in src.rglob("*"):
        rel = p.relative_to(src)
        if should_skip(rel):
            continue
        if p.is_symlink():
            continue
        target = dst / rel
        if p.is_dir():
            target.mkdir(parents=True, exist_ok=True)
        elif p.is_file():
            # BUG FIX #5: Skip files that exceed the size limit.
            try:
                if p.stat().st_size > MAX_FILE_SIZE_BYTES:
                    continue
            except OSError:
                continue
            target.parent.mkdir(parents=True, exist_ok=True)
            try:
                shutil.copy2(p, target)
            except OSError:
                pass

def is_jupyter_running(port: int = 8888) -> bool:
    """Return True if JupyterLab is already listening on *port*.

    BUG FIX #2 (safety net): restore_once() must never run while JupyterLab
    is active.  Overwriting files under JUPYTER_ROOT (runtime/ sockets, lab/
    settings, kernel connection files) while JupyterLab is live corrupts its
    state and causes it to exit within seconds.

    The primary guard is the --restore / sync separation introduced in
    BUG FIX #3, but this TCP probe stays as a hard backstop for any future
    code path that might call restore_once() unexpectedly.
    """
    try:
        with socket.create_connection(("127.0.0.1", port), timeout=2):
            return True
    except OSError:
        return False

def restore_once(api, rid: str):
    from huggingface_hub.errors import RepositoryNotFoundError
    tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
    try:
        snapshot_download(repo_id=rid, repo_type="dataset", local_dir=str(tmp), local_dir_use_symlinks=False, token=HF_TOKEN)
        for p in tmp.rglob("*"):
            rel = p.relative_to(tmp)
            if should_skip(rel):
                continue
            if str(rel) == ".gitattributes":
                continue
            target = JUPYTER_ROOT / rel
            if p.is_dir():
                target.mkdir(parents=True, exist_ok=True)
            elif p.is_file():
                target.parent.mkdir(parents=True, exist_ok=True)
                try:
                    shutil.copy2(p, target)
                except OSError as exc:
                    kind = classify_error(exc)
                    print(f"DevData restore skip [{kind}] (cannot write {target}): {exc}")
        print(f"DevData restored from {rid}")
    except RepositoryNotFoundError:
        print(f"DevData dataset not found yet: {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData restore warning [{kind}]: {exc}")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)

def prune_remote_deleted_files(api, rid: str, snapshot_dir: Path) -> None:
    """BUG FIX #6: Delete from the HF dataset any files the user deleted
    locally.  Without this, deleted files re-appear on the next Space restart
    because restore_once() copies everything in the dataset back to disk.
    Mirrors the prune_remote_deleted_files() logic in openclaw-sync.py.
    """
    try:
        local_files = {
            p.relative_to(snapshot_dir).as_posix()
            for p in snapshot_dir.rglob("*")
            if p.is_file()
        }
        remote_files = list(api.list_repo_files(repo_id=rid, repo_type="dataset"))
        stale = [f for f in remote_files if f not in local_files and f != ".gitattributes"]
        if stale:
            api.delete_files(
                delete_patterns=stale,
                repo_id=rid,
                repo_type="dataset",
                commit_message=f"DevData prune {len(stale)} deleted file(s) {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
            )
            print(f"DevData pruned {len(stale)} deleted file(s) from {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData prune warning [{kind}]: {exc}")

def sync_loop(api, rid: str):
    while True:
        tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
        try:
            snapshot(JUPYTER_ROOT, tmp)
            upload_folder(
                folder_path=str(tmp),
                repo_id=rid,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
                ignore_patterns=[".git/*", ".git"],
            )
            print(f"DevData synced to {rid}")
            # BUG FIX #6: Prune files deleted locally so they don't reappear on restore.
            prune_remote_deleted_files(api, rid, tmp)
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData sync warning [{kind}]: {exc}")
        finally:
            shutil.rmtree(tmp, ignore_errors=True)
        time.sleep(INTERVAL)


if __name__ == "__main__":
    if not enabled():
        print("DevData sync disabled.")
        raise SystemExit(0)

    from huggingface_hub import HfApi, upload_folder, snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    api = HfApi(token=HF_TOKEN)
    rid = repo_id(api)
    try:
        api.repo_info(repo_id=rid, repo_type="dataset")
    except RepositoryNotFoundError:
        api.create_repo(repo_id=rid, repo_type="dataset", private=True)

    # ── BUG FIX #3: Restore must happen BEFORE JupyterLab starts ──────────
    # The original code always called restore_once() here, but start.sh starts
    # JupyterLab long before the gateway is ready and this script is launched.
    # That made restore_once() ALWAYS run while JupyterLab was live, which
    # overwrote its runtime/ sockets and settings β†’ JupyterLab died.
    #
    # Fix: start.sh now calls  `python3 jupyter-devdata-sync.py --restore`
    # BEFORE starting JupyterLab.  That --restore invocation does the restore
    # and exits.  This background invocation (no --restore flag) skips straight
    # to sync_loop so it never touches files while JupyterLab is running.
    #
    # BUG FIX #2 (safety net): If JupyterLab is somehow already running when
    # this code path is reached, abort restore to avoid corrupting its state.
    if "--restore" in sys.argv:
        # Synchronous restore mode β€” called by start.sh before JupyterLab.
        validate_jupyter_paths()
        restore_once(api, rid)
        raise SystemExit(0)

    # Normal background sync mode β€” no restore; go straight to upload loop.
    validate_jupyter_paths()
    if is_jupyter_running():
        print("DevData: background sync started (JupyterLab is live, restore already done by --restore).")
    else:
        # Fallback: JupyterLab not detected.  Should not normally happen
        # because start.sh calls --restore before starting JupyterLab and then
        # waits for the gateway before launching this background process.
        # Log a warning and proceed to sync; do NOT restore to avoid racing
        # with a JupyterLab that may be in the middle of starting up.
        print("DevData: WARNING β€” JupyterLab not detected on port 8888. Skipping restore to be safe; starting sync loop.")

    sync_loop(api, rid)