somratpro committed on
Commit
915aa9e
·
1 Parent(s): aea0c64

feat: implement file/directory exclusion logic and switch to upload_large_folder for workspace syncing

Browse files
Files changed (1) hide show
  1. workspace-sync.py +41 -11
workspace-sync.py CHANGED
@@ -34,6 +34,13 @@ SPACE_AUTHOR_NAME = os.environ.get("SPACE_AUTHOR_NAME", "").strip()
34
  BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "huggingclaw-backup").strip()
35
  WHATSAPP_ENABLED = os.environ.get("WHATSAPP_ENABLED", "").strip().lower() == "true"
36
 
 
 
 
 
 
 
 
37
  STATE_DIR = WORKSPACE / ".huggingclaw-state"
38
  OPENCLAW_STATE_BACKUP_DIR = STATE_DIR / "openclaw"
39
  EXCLUDED_STATE_NAMES = {
@@ -183,6 +190,19 @@ def ensure_repo_exists() -> str:
183
  return repo_id
184
 
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  def metadata_marker(root: Path) -> tuple[int, int, int]:
187
  if not root.exists():
188
  return (0, 0, 0)
@@ -194,7 +214,7 @@ def metadata_marker(root: Path) -> tuple[int, int, int]:
194
  if not path.is_file():
195
  continue
196
  rel = path.relative_to(root).as_posix()
197
- if rel.startswith(".git/"):
198
  continue
199
  try:
200
  stat = path.stat()
@@ -213,7 +233,7 @@ def fingerprint_dir(root: Path) -> str:
213
 
214
  for path in sorted(p for p in root.rglob("*") if p.is_file()):
215
  rel = path.relative_to(root).as_posix()
216
- if rel.startswith(".git/"):
217
  continue
218
  hasher.update(rel.encode("utf-8"))
219
  with path.open("rb") as handle:
@@ -227,7 +247,7 @@ def create_snapshot_dir(source_root: Path) -> Path:
227
  for path in sorted(source_root.rglob("*")):
228
  rel = path.relative_to(source_root)
229
  rel_posix = rel.as_posix()
230
- if rel_posix.startswith(".git/") or rel_posix == ".git":
231
  continue
232
  target = staging_root / rel
233
  if path.is_dir():
@@ -321,14 +341,24 @@ def sync_once(
321
  write_status("syncing", f"Uploading workspace to {repo_id}")
322
  snapshot_dir = create_snapshot_dir(WORKSPACE)
323
  try:
324
- upload_folder(
325
- folder_path=str(snapshot_dir),
326
- repo_id=repo_id,
327
- repo_type="dataset",
328
- token=HF_TOKEN,
329
- commit_message=f"HuggingClaw sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
330
- ignore_patterns=[".git/*", ".git"],
331
- )
 
 
 
 
 
 
 
 
 
 
332
  finally:
333
  shutil.rmtree(snapshot_dir, ignore_errors=True)
334
 
 
34
  BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "huggingclaw-backup").strip()
35
  WHATSAPP_ENABLED = os.environ.get("WHATSAPP_ENABLED", "").strip().lower() == "true"
36
 
37
+ EXCLUDED_SYNC_DIRS = {
38
+ "node_modules", ".git", "__pycache__", ".venv", "venv",
39
+ ".npm", ".cache", ".yarn", "dist", "build", ".next", ".nuxt",
40
+ ".turbo", ".parcel-cache", "target", ".gradle", ".mvn",
41
+ }
42
+ MAX_FILE_SIZE_BYTES = int(os.environ.get("SYNC_MAX_FILE_BYTES", str(50 * 1024 * 1024)))
43
+
44
  STATE_DIR = WORKSPACE / ".huggingclaw-state"
45
  OPENCLAW_STATE_BACKUP_DIR = STATE_DIR / "openclaw"
46
  EXCLUDED_STATE_NAMES = {
 
190
  return repo_id
191
 
192
 
193
def _should_exclude(rel_posix: str, path: Path) -> bool:
    """Decide whether a workspace entry must be left out of the sync.

    Args:
        rel_posix: The entry's path relative to the sync root, in POSIX form.
        path: The entry's actual filesystem path, used for the size check.

    Returns:
        True when any component of ``rel_posix`` is listed in
        ``EXCLUDED_SYNC_DIRS``, or when ``path`` is a regular file larger
        than ``MAX_FILE_SIZE_BYTES``; False otherwise.
    """
    # Every component is checked, so entries nested at any depth below an
    # excluded name (and the excluded entry itself) are filtered out.
    for component in Path(rel_posix).parts:
        if component in EXCLUDED_SYNC_DIRS:
            return True

    # The size limit only applies to files.
    if not path.is_file():
        return False

    try:
        size = path.stat().st_size
    except OSError:
        # Best effort: if the size cannot be read, do not exclude the entry.
        return False
    return size > MAX_FILE_SIZE_BYTES
204
+
205
+
206
  def metadata_marker(root: Path) -> tuple[int, int, int]:
207
  if not root.exists():
208
  return (0, 0, 0)
 
214
  if not path.is_file():
215
  continue
216
  rel = path.relative_to(root).as_posix()
217
+ if _should_exclude(rel, path):
218
  continue
219
  try:
220
  stat = path.stat()
 
233
 
234
  for path in sorted(p for p in root.rglob("*") if p.is_file()):
235
  rel = path.relative_to(root).as_posix()
236
+ if _should_exclude(rel, path):
237
  continue
238
  hasher.update(rel.encode("utf-8"))
239
  with path.open("rb") as handle:
 
247
  for path in sorted(source_root.rglob("*")):
248
  rel = path.relative_to(source_root)
249
  rel_posix = rel.as_posix()
250
+ if _should_exclude(rel_posix, path):
251
  continue
252
  target = staging_root / rel
253
  if path.is_dir():
 
341
  write_status("syncing", f"Uploading workspace to {repo_id}")
342
  snapshot_dir = create_snapshot_dir(WORKSPACE)
343
  try:
344
+ try:
345
+ HF_API.upload_large_folder(
346
+ repo_id=repo_id,
347
+ repo_type="dataset",
348
+ folder_path=str(snapshot_dir),
349
+ path_in_repo=".",
350
+ num_workers=2,
351
+ print_report=False,
352
+ )
353
+ except AttributeError:
354
+ upload_folder(
355
+ folder_path=str(snapshot_dir),
356
+ repo_id=repo_id,
357
+ repo_type="dataset",
358
+ token=HF_TOKEN,
359
+ commit_message=f"HuggingClaw sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
360
+ ignore_patterns=[".git/*", ".git"],
361
+ )
362
  finally:
363
  shutil.rmtree(snapshot_dir, ignore_errors=True)
364