Spaces:
Running
Running
feat: implement file/directory exclusion logic and switch to upload_large_folder for workspace syncing
Browse files- workspace-sync.py +41 -11
workspace-sync.py
CHANGED
|
@@ -34,6 +34,13 @@ SPACE_AUTHOR_NAME = os.environ.get("SPACE_AUTHOR_NAME", "").strip()
|
|
| 34 |
BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "huggingclaw-backup").strip()
|
| 35 |
WHATSAPP_ENABLED = os.environ.get("WHATSAPP_ENABLED", "").strip().lower() == "true"
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
STATE_DIR = WORKSPACE / ".huggingclaw-state"
|
| 38 |
OPENCLAW_STATE_BACKUP_DIR = STATE_DIR / "openclaw"
|
| 39 |
EXCLUDED_STATE_NAMES = {
|
|
@@ -183,6 +190,19 @@ def ensure_repo_exists() -> str:
|
|
| 183 |
return repo_id
|
| 184 |
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
def metadata_marker(root: Path) -> tuple[int, int, int]:
|
| 187 |
if not root.exists():
|
| 188 |
return (0, 0, 0)
|
|
@@ -194,7 +214,7 @@ def metadata_marker(root: Path) -> tuple[int, int, int]:
|
|
| 194 |
if not path.is_file():
|
| 195 |
continue
|
| 196 |
rel = path.relative_to(root).as_posix()
|
| 197 |
-
if
|
| 198 |
continue
|
| 199 |
try:
|
| 200 |
stat = path.stat()
|
|
@@ -213,7 +233,7 @@ def fingerprint_dir(root: Path) -> str:
|
|
| 213 |
|
| 214 |
for path in sorted(p for p in root.rglob("*") if p.is_file()):
|
| 215 |
rel = path.relative_to(root).as_posix()
|
| 216 |
-
if
|
| 217 |
continue
|
| 218 |
hasher.update(rel.encode("utf-8"))
|
| 219 |
with path.open("rb") as handle:
|
|
@@ -227,7 +247,7 @@ def create_snapshot_dir(source_root: Path) -> Path:
|
|
| 227 |
for path in sorted(source_root.rglob("*")):
|
| 228 |
rel = path.relative_to(source_root)
|
| 229 |
rel_posix = rel.as_posix()
|
| 230 |
-
if
|
| 231 |
continue
|
| 232 |
target = staging_root / rel
|
| 233 |
if path.is_dir():
|
|
@@ -321,14 +341,24 @@ def sync_once(
|
|
| 321 |
write_status("syncing", f"Uploading workspace to {repo_id}")
|
| 322 |
snapshot_dir = create_snapshot_dir(WORKSPACE)
|
| 323 |
try:
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
finally:
|
| 333 |
shutil.rmtree(snapshot_dir, ignore_errors=True)
|
| 334 |
|
|
|
|
| 34 |
BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "huggingclaw-backup").strip()
|
| 35 |
WHATSAPP_ENABLED = os.environ.get("WHATSAPP_ENABLED", "").strip().lower() == "true"
|
| 36 |
|
| 37 |
+
# Directory names that are never synced (dependency trees, VCS metadata,
# caches and build output).
EXCLUDED_SYNC_DIRS = {
    "node_modules", ".git", "__pycache__", ".venv", "venv",
    ".npm", ".cache", ".yarn", "dist", "build", ".next", ".nuxt",
    ".turbo", ".parcel-cache", "target", ".gradle", ".mvn",
}

# Regular files larger than this many bytes are skipped by the sync.
# Overridable through SYNC_MAX_FILE_BYTES; an empty or non-numeric value
# falls back to the 50 MiB default instead of aborting at import time.
_DEFAULT_MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024
try:
    MAX_FILE_SIZE_BYTES = int(
        os.environ.get("SYNC_MAX_FILE_BYTES", str(_DEFAULT_MAX_FILE_SIZE_BYTES))
    )
except ValueError:
    MAX_FILE_SIZE_BYTES = _DEFAULT_MAX_FILE_SIZE_BYTES
|
| 43 |
+
|
| 44 |
STATE_DIR = WORKSPACE / ".huggingclaw-state"
|
| 45 |
OPENCLAW_STATE_BACKUP_DIR = STATE_DIR / "openclaw"
|
| 46 |
EXCLUDED_STATE_NAMES = {
|
|
|
|
| 190 |
return repo_id
|
| 191 |
|
| 192 |
|
| 193 |
+
def _should_exclude(rel_posix: str, path: Path) -> bool:
    """Return True when a workspace entry must be left out of the sync.

    An entry is skipped when it is, or lives under, a directory whose name
    is in ``EXCLUDED_SYNC_DIRS``, or when it is a regular file larger than
    ``MAX_FILE_SIZE_BYTES``.

    Args:
        rel_posix: POSIX-style path of the entry relative to the sync root.
        path: Filesystem path of the same entry, used for type/size checks.

    Returns:
        True if the entry should be skipped, False otherwise.
    """
    parts = Path(rel_posix).parts
    # Only directory components are matched against the excluded names, so a
    # regular file that merely shares a name with one of them (e.g. a
    # ``build`` script) is still backed up.
    dir_parts = parts if path.is_dir() else parts[:-1]
    if any(part in EXCLUDED_SYNC_DIRS for part in dir_parts):
        return True
    if path.is_file():
        try:
            return path.stat().st_size > MAX_FILE_SIZE_BYTES
        except OSError:
            # Best effort: when the size cannot be read, do not exclude the
            # entry here and leave the decision to the caller.
            return False
    return False
|
| 204 |
+
|
| 205 |
+
|
| 206 |
def metadata_marker(root: Path) -> tuple[int, int, int]:
|
| 207 |
if not root.exists():
|
| 208 |
return (0, 0, 0)
|
|
|
|
| 214 |
if not path.is_file():
|
| 215 |
continue
|
| 216 |
rel = path.relative_to(root).as_posix()
|
| 217 |
+
if _should_exclude(rel, path):
|
| 218 |
continue
|
| 219 |
try:
|
| 220 |
stat = path.stat()
|
|
|
|
| 233 |
|
| 234 |
for path in sorted(p for p in root.rglob("*") if p.is_file()):
|
| 235 |
rel = path.relative_to(root).as_posix()
|
| 236 |
+
if _should_exclude(rel, path):
|
| 237 |
continue
|
| 238 |
hasher.update(rel.encode("utf-8"))
|
| 239 |
with path.open("rb") as handle:
|
|
|
|
| 247 |
for path in sorted(source_root.rglob("*")):
|
| 248 |
rel = path.relative_to(source_root)
|
| 249 |
rel_posix = rel.as_posix()
|
| 250 |
+
if _should_exclude(rel_posix, path):
|
| 251 |
continue
|
| 252 |
target = staging_root / rel
|
| 253 |
if path.is_dir():
|
|
|
|
| 341 |
write_status("syncing", f"Uploading workspace to {repo_id}")
|
| 342 |
snapshot_dir = create_snapshot_dir(WORKSPACE)
|
| 343 |
try:
|
| 344 |
+
try:
|
| 345 |
+
HF_API.upload_large_folder(
|
| 346 |
+
repo_id=repo_id,
|
| 347 |
+
repo_type="dataset",
|
| 348 |
+
folder_path=str(snapshot_dir),
|
| 349 |
+
path_in_repo=".",
|
| 350 |
+
num_workers=2,
|
| 351 |
+
print_report=False,
|
| 352 |
+
)
|
| 353 |
+
except AttributeError:
|
| 354 |
+
upload_folder(
|
| 355 |
+
folder_path=str(snapshot_dir),
|
| 356 |
+
repo_id=repo_id,
|
| 357 |
+
repo_type="dataset",
|
| 358 |
+
token=HF_TOKEN,
|
| 359 |
+
commit_message=f"HuggingClaw sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
|
| 360 |
+
ignore_patterns=[".git/*", ".git"],
|
| 361 |
+
)
|
| 362 |
finally:
|
| 363 |
shutil.rmtree(snapshot_dir, ignore_errors=True)
|
| 364 |
|