anurag008w committed on
Commit
f54eaff
·
1 Parent(s): c1bf580

Implement bug fixes for file handling and JupyterLab checks

Browse files

Added multiple bug fixes to handle file size limits, exclude paths correctly, and ensure JupyterLab is not running during restore.

Files changed (1) hide show
  1. jupyter-devdata-sync.py +111 -8
jupyter-devdata-sync.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  from __future__ import annotations
3
 
4
- import os, shutil, tempfile, time
5
  from pathlib import Path
6
 
7
  HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
@@ -10,6 +10,12 @@ DATASET_NAME = os.environ.get("DEVDATA_DATASET_NAME", "").strip() or "huggingcla
10
  BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "").strip() or os.environ.get("BACKUP_DATASET", "").strip() or "huggingclaw-backup"
11
  JUPYTER_ROOT = Path(os.environ.get("JUPYTER_ROOT_DIR", "/home/node")).resolve()
12
  INTERVAL = int((os.environ.get("DEVDATA_SYNC_INTERVAL", "").strip() or "180"))
 
 
 
 
 
 
13
  def is_true(value):
14
  return str(value).strip().lower() in {"1", "true", "yes", "on"}
15
 
@@ -26,12 +32,17 @@ def classify_error(exc: Exception) -> str:
26
  return "safety-scan"
27
  return "general"
28
 
 
 
 
 
 
29
  EXCLUDE = {
30
  ".cache",
31
  "node_modules",
32
  ".npm",
33
  ".yarn",
34
- ".local/share/Trash",
35
  ".ipynb_checkpoints",
36
  ".openclaw",
37
  "app",
@@ -85,13 +96,38 @@ def snapshot(src: Path, dst: Path):
85
  if p.is_dir():
86
  target.mkdir(parents=True, exist_ok=True)
87
  elif p.is_file():
 
 
 
 
 
 
88
  target.parent.mkdir(parents=True, exist_ok=True)
89
  try:
90
  shutil.copy2(p, target)
91
  except OSError:
92
  pass
93
 
94
- def restore_once(api: HfApi, rid: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
96
  try:
97
  snapshot_download(repo_id=rid, repo_type="dataset", local_dir=str(tmp), local_dir_use_symlinks=False, token=HF_TOKEN)
@@ -99,6 +135,8 @@ def restore_once(api: HfApi, rid: str):
99
  rel = p.relative_to(tmp)
100
  if should_skip(rel):
101
  continue
 
 
102
  target = JUPYTER_ROOT / rel
103
  if p.is_dir():
104
  target.mkdir(parents=True, exist_ok=True)
@@ -118,15 +156,48 @@ def restore_once(api: HfApi, rid: str):
118
  finally:
119
  shutil.rmtree(tmp, ignore_errors=True)
120
 
121
- def sync_loop(api: HfApi, rid: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  while True:
123
  tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
124
  try:
125
  snapshot(JUPYTER_ROOT, tmp)
126
- upload_folder(folder_path=str(tmp), repo_id=rid, repo_type="dataset", token=HF_TOKEN,
127
- commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
128
- ignore_patterns=[".git/*", ".git"])
 
 
 
 
 
129
  print(f"DevData synced to {rid}")
 
 
130
  except Exception as exc:
131
  kind = classify_error(exc)
132
  print(f"DevData sync warning [{kind}]: {exc}")
@@ -134,10 +205,12 @@ def sync_loop(api: HfApi, rid: str):
134
  shutil.rmtree(tmp, ignore_errors=True)
135
  time.sleep(INTERVAL)
136
 
 
137
  if __name__ == "__main__":
138
  if not enabled():
139
  print("DevData sync disabled.")
140
  raise SystemExit(0)
 
141
  from huggingface_hub import HfApi, upload_folder, snapshot_download
142
  from huggingface_hub.errors import RepositoryNotFoundError
143
 
@@ -147,6 +220,36 @@ if __name__ == "__main__":
147
  api.repo_info(repo_id=rid, repo_type="dataset")
148
  except RepositoryNotFoundError:
149
  api.create_repo(repo_id=rid, repo_type="dataset", private=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  validate_jupyter_paths()
151
- restore_once(api, rid)
 
 
 
 
 
 
 
 
 
152
  sync_loop(api, rid)
 
1
  #!/usr/bin/env python3
2
  from __future__ import annotations
3
 
4
+ import os, shutil, socket, sys, tempfile, time
5
  from pathlib import Path
6
 
7
  HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
 
10
  BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "").strip() or os.environ.get("BACKUP_DATASET", "").strip() or "huggingclaw-backup"
11
  JUPYTER_ROOT = Path(os.environ.get("JUPYTER_ROOT_DIR", "/home/node")).resolve()
12
  INTERVAL = int((os.environ.get("DEVDATA_SYNC_INTERVAL", "").strip() or "180"))
13
+ # BUG FIX #5: Respect max file size so giant files don't stall uploads.
14
+ # Matches the 50 MB ceiling in openclaw-sync.py; override with DEVDATA_MAX_FILE_BYTES.
15
+ MAX_FILE_SIZE_BYTES = int(
16
+ (os.environ.get("DEVDATA_MAX_FILE_BYTES", "").strip() or str(50 * 1024 * 1024))
17
+ )
18
+
19
  def is_true(value):
20
  return str(value).strip().lower() in {"1", "true", "yes", "on"}
21
 
 
32
  return "safety-scan"
33
  return "general"
34
 
35
+ # BUG FIX #4: ".local/share/Trash" in the original EXCLUDE set was a
36
+ # multi-component path string that was never matched because parts-based
37
+ # lookup compares individual directory names. Added "Trash" as a standalone
38
+ # component so any path with a "Trash" segment (e.g. .local/share/Trash/*)
39
+ # is correctly skipped during snapshot and restore.
40
  EXCLUDE = {
41
  ".cache",
42
  "node_modules",
43
  ".npm",
44
  ".yarn",
45
+ "Trash", # BUG FIX #4: covers .local/share/Trash (was ".local/share/Trash" — never matched)
46
  ".ipynb_checkpoints",
47
  ".openclaw",
48
  "app",
 
96
  if p.is_dir():
97
  target.mkdir(parents=True, exist_ok=True)
98
  elif p.is_file():
99
+ # BUG FIX #5: Skip files that exceed the size limit.
100
+ try:
101
+ if p.stat().st_size > MAX_FILE_SIZE_BYTES:
102
+ continue
103
+ except OSError:
104
+ continue
105
  target.parent.mkdir(parents=True, exist_ok=True)
106
  try:
107
  shutil.copy2(p, target)
108
  except OSError:
109
  pass
110
 
111
def is_jupyter_running(port: int = 8888) -> bool:
    """Return True if something accepts TCP connections on 127.0.0.1:*port*.

    This is a liveness probe for JupyterLab. Per the notes in this file,
    overwriting files under JUPYTER_ROOT (runtime/ sockets, lab/ settings,
    kernel connection files) while JupyterLab is live corrupts its state, so
    callers can use this probe to tell whether JupyterLab is already up.

    Args:
        port: TCP port to probe on the loopback interface (default 8888).

    Returns:
        True if a TCP connection is established within 2 seconds; False if
        the connection is refused, times out, or *port* is an integer
        outside the valid range 1-65535.
    """
    # An out-of-range integer port is never a live listener. Rejecting it here
    # avoids an OverflowError escaping from the socket layer (it is not an
    # OSError) or the resolver silently truncating the value and probing a
    # different port than the one requested.
    if isinstance(port, int) and not 0 < port <= 65535:
        return False
    try:
        # The context manager closes the probe socket immediately on success.
        with socket.create_connection(("127.0.0.1", port), timeout=2):
            return True
    except OSError:
        # Refused, unreachable, or timed out: nothing is listening.
        return False
128
+
129
+ def restore_once(api, rid: str):
130
+ from huggingface_hub.errors import RepositoryNotFoundError
131
  tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
132
  try:
133
  snapshot_download(repo_id=rid, repo_type="dataset", local_dir=str(tmp), local_dir_use_symlinks=False, token=HF_TOKEN)
 
135
  rel = p.relative_to(tmp)
136
  if should_skip(rel):
137
  continue
138
+ if str(rel) == ".gitattributes":
139
+ continue
140
  target = JUPYTER_ROOT / rel
141
  if p.is_dir():
142
  target.mkdir(parents=True, exist_ok=True)
 
156
  finally:
157
  shutil.rmtree(tmp, ignore_errors=True)
158
 
159
def prune_remote_deleted_files(api, rid: str, snapshot_dir: Path) -> None:
    """Delete from the HF dataset any files that are absent from *snapshot_dir*.

    BUG FIX #6: without pruning, files the user deleted locally re-appear on
    the next restore because the restore copies everything in the dataset
    back to disk.

    Args:
        api: Hub client exposing ``list_repo_files`` and ``delete_files``.
        rid: Dataset repository id to prune.
        snapshot_dir: Local snapshot directory that was just uploaded; the
            files found under it define what should remain in the dataset.

    Returns:
        None. Failures are reported with a printed warning and never raised,
        so a prune problem cannot break the caller.
    """
    import glob  # local import: only needed here to escape wildcard characters

    try:
        local_files = {
            p.relative_to(snapshot_dir).as_posix()
            for p in snapshot_dir.rglob("*")
            if p.is_file()
        }
        # Safety guard: an empty snapshot would mark every remote file as
        # stale and wipe the whole backup. Treat it as "nothing to compare
        # against" rather than "the user deleted everything".
        if not local_files:
            print(f"DevData prune skipped: local snapshot is empty; refusing to delete every file in {rid}")
            return
        stale = [
            f
            for f in api.list_repo_files(repo_id=rid, repo_type="dataset")
            if f not in local_files and f != ".gitattributes"
        ]
        if not stale:
            return
        api.delete_files(
            # delete_patterns are shell-style wildcards, so escape each path
            # to make sure names containing *, ? or [ match only themselves.
            delete_patterns=[glob.escape(f) for f in stale],
            repo_id=rid,
            repo_type="dataset",
            commit_message=f"DevData prune {len(stale)} deleted file(s) {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
        )
        print(f"DevData pruned {len(stale)} deleted file(s) from {rid}")
    except Exception as exc:
        # Best-effort boundary: pruning must never abort the caller.
        kind = classify_error(exc)
        print(f"DevData prune warning [{kind}]: {exc}")
184
+
185
+ def sync_loop(api, rid: str):
186
  while True:
187
  tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
188
  try:
189
  snapshot(JUPYTER_ROOT, tmp)
190
+ upload_folder(
191
+ folder_path=str(tmp),
192
+ repo_id=rid,
193
+ repo_type="dataset",
194
+ token=HF_TOKEN,
195
+ commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
196
+ ignore_patterns=[".git/*", ".git"],
197
+ )
198
  print(f"DevData synced to {rid}")
199
+ # BUG FIX #6: Prune files deleted locally so they don't reappear on restore.
200
+ prune_remote_deleted_files(api, rid, tmp)
201
  except Exception as exc:
202
  kind = classify_error(exc)
203
  print(f"DevData sync warning [{kind}]: {exc}")
 
205
  shutil.rmtree(tmp, ignore_errors=True)
206
  time.sleep(INTERVAL)
207
 
208
+
209
  if __name__ == "__main__":
210
  if not enabled():
211
  print("DevData sync disabled.")
212
  raise SystemExit(0)
213
+
214
  from huggingface_hub import HfApi, upload_folder, snapshot_download
215
  from huggingface_hub.errors import RepositoryNotFoundError
216
 
 
220
  api.repo_info(repo_id=rid, repo_type="dataset")
221
  except RepositoryNotFoundError:
222
  api.create_repo(repo_id=rid, repo_type="dataset", private=True)
223
+
224
+ # ── BUG FIX #3: Restore must happen BEFORE JupyterLab starts ──────────
225
+ # The original code always called restore_once() here, but start.sh starts
226
+ # JupyterLab long before the gateway is ready and this script is launched.
227
+ # That made restore_once() ALWAYS run while JupyterLab was live, which
228
+ # overwrote its runtime/ sockets and settings → JupyterLab died.
229
+ #
230
+ # Fix: start.sh now calls `python3 jupyter-devdata-sync.py --restore`
231
+ # BEFORE starting JupyterLab. That --restore invocation does the restore
232
+ # and exits. This background invocation (no --restore flag) skips straight
233
+ # to sync_loop so it never touches files while JupyterLab is running.
234
+ #
235
+ # BUG FIX #2 (safety net): If JupyterLab is somehow already running when
236
+ # this code path is reached, abort restore to avoid corrupting its state.
237
+ if "--restore" in sys.argv:
238
+ # Synchronous restore mode — called by start.sh before JupyterLab.
239
+ validate_jupyter_paths()
240
+ restore_once(api, rid)
241
+ raise SystemExit(0)
242
+
243
+ # Normal background sync mode — no restore; go straight to upload loop.
244
  validate_jupyter_paths()
245
+ if is_jupyter_running():
246
+ print("DevData: background sync started (JupyterLab is live, restore already done by --restore).")
247
+ else:
248
+ # Fallback: JupyterLab not detected. Should not normally happen
249
+ # because start.sh calls --restore before starting JupyterLab and then
250
+ # waits for the gateway before launching this background process.
251
+ # Log a warning and proceed to sync; do NOT restore to avoid racing
252
+ # with a JupyterLab that may be in the middle of starting up.
253
+ print("DevData: WARNING β€” JupyterLab not detected on port 8888. Skipping restore to be safe; starting sync loop.")
254
+
255
  sync_loop(api, rid)