#!/usr/bin/env bash
# ============================================================
# OpenClaw HF Dataset backup cleanup script
#
# Deletes every backup file in a Hugging Face Dataset whose file name carries
# a date on or before the given date. With the date 20260427, files whose
# names contain openclaw-backup-20260427, openclaw-backup-20260426,
# openclaw-backup-20260425 ... and any earlier date are deleted.
# Note: strings such as "openclaw-backup-20260427" are parts of FILE names,
# not folders.
#
# Examples of file paths that actually exist in the dataset:
#   backups/openclaw-backup-20260430-215503.tar.gz.enc
#   backups/openclaw-backup-20260430-215503.tar.gz.enc.meta.json
#   backups/openclaw-backup-20260430-223004.tar.gz.enc.meta.json
#   backups/openclaw-backup-20260430-223004.tar.gz.enc.part-aa
#   backups/openclaw-backup-20260430-223004.tar.gz.enc.part-ab
#   backups/openclaw-backup-20260430-223004.tar.gz.enc.part-ac
#   backups/openclaw-backup-20260430-235503.tar.gz.enc
#   backups/openclaw-backup-20260430-235503.tar.gz.enc.meta.json
#
# Usage:
#   ./delete-backups.sh DATASET_REPO DELETE_DATE [HF_TOKEN]
#
# Examples (same argument pattern as push-to-space.sh):
#   # Option 1: pass everything as arguments
#   ./scripts/delete-backups.sh GGSheng/page-backup 20260427 hf_xxxxx
#
#   # Option 2: use environment variables
#   export DATASET_REPO="GGSheng/page-backup"
#   export DELETE_DATE="20260427"
#   export HF_TOKEN="hf_xxxxx"
#   ./scripts/delete-backups.sh
#
#   # Option 3: use the cached token (read from ~/.cache/huggingface/token)
#   ./scripts/delete-backups.sh GGSheng/page-backup 20260427
#
#   # Preview only
#   DRY_RUN=1 ./scripts/delete-backups.sh GGSheng/page-backup 20260430
#
#   # Delete using batch mode
#   BATCH_DELETE=1 DRY_RUN=0 ./scripts/delete-backups.sh GGSheng/page-backup 20260430
#
# Environment variables (optional):
#   DATASET_REPO   - Dataset repo ID (e.g. GGSheng/page-backup); falls back to
#                    OPENCLAW_BACKUP_DATASET_REPO
#   DELETE_DATE    - Cut-off date (YYYYMMDD); files dated on or before it match
#   HF_TOKEN       - Hugging Face API token
#   HF_TOKEN_FILE  - Token file path (default ~/.cache/huggingface/token)
#   REPO_TYPE      - Repo type passed to the Hub API (default "dataset")
#   DRY_RUN        - 1 = only list the matching files, delete nothing (default 0)
#   BATCH_DELETE   - 1 = use the batch delete API (default 0, one file at a time)
#
# Delete modes:
#   BATCH_DELETE=0 (default): one commit per file, with retry / rate-limit
#                             handling; tolerant of individual failures.
#   BATCH_DELETE=1:           delete_files() with wildcard patterns, up to 10
#                             date patterns per commit; far fewer commits.
#
# Notes:
#   1. HF_TOKEN must have write permission on the Dataset.
#   2. Deletion is irreversible; preview with DRY_RUN=1 first.
#   3. The date format is fixed: YYYYMMDD (e.g. 20260427).
#   4. In batch mode each commit covers many files; a failed commit deletes
#      none of the files of that batch, and the script stops at that batch.
#
# Exit status: 0 on success (including "nothing to delete" and DRY_RUN),
# non-zero on usage errors, API errors, or when any deletion failed.
# ============================================================

set -euo pipefail

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "$SCRIPT_DIR/.." && pwd)"

# Python launcher, kept as an array so that "uv run python3" is passed as
# three separate words without relying on unquoted word splitting.
PYTHON=(python3)

# Private scratch directory; created in main(), removed by the EXIT trap.
WORK_DIR=""

#######################################
# Print a message to stderr and exit with status 1.
# Arguments: message words
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Print the usage text to stderr.
#######################################
usage() {
  {
    echo "Usage: $0 DATASET_REPO DELETE_DATE [HF_TOKEN]"
    echo ""
    echo "Examples:"
    echo "  $0 GGSheng/page-backup 20260427 hf_xxxxx"
    echo "  DATASET_REPO=GGSheng/page-backup DELETE_DATE=20260427 HF_TOKEN=hf_xxxxx $0"
    echo "  $0 GGSheng/page-backup 20260427"
    echo ""
    echo "Environment variables:"
    echo "  DATASET_REPO  - Dataset 仓库 ID (如 GGSheng/page-backup)"
    echo "  DELETE_DATE   - 要清理的日期 (YYYYMMDD)"
    echo "  HF_TOKEN      - Hugging Face API Token"
    echo "  HF_TOKEN_FILE - Token 文件路径 (默认 ~/.cache/huggingface/token)"
    echo "  DRY_RUN       - 设为 1 仅预览 (默认 0)"
    echo "  BATCH_DELETE  - 设为 1 使用批量删除 (默认 0)"
  } >&2
}

#######################################
# Remove the scratch directory. Installed as the EXIT trap, so it must be
# safe to call before WORK_DIR has been created.
# Globals: WORK_DIR (read)
#######################################
cleanup() {
  if [[ -n "$WORK_DIR" ]]; then
    rm -rf -- "$WORK_DIR"
  fi
}

#######################################
# Use "uv run python3" when uv is installed and the repo has a pyproject.toml.
# Globals: REPO_ROOT (read), PYTHON (written)
#######################################
detect_python() {
  if command -v uv &>/dev/null && [[ -f "$REPO_ROOT/pyproject.toml" ]]; then
    PYTHON=(uv run python3)
    echo "[UV] Detected uv project, using: ${PYTHON[*]}"
  fi
}

#######################################
# Check that a YYYYMMDD string names a real calendar date.
# Implemented in pure Bash because `date -d` exists only in GNU date.
# Arguments: $1 - candidate date
# Returns:   0 if valid, 1 otherwise
#######################################
is_valid_date() {
  local ymd=$1
  [[ "$ymd" =~ ^[0-9]{8}$ ]] || return 1

  # 10# forces base 10 so that "08" / "09" are not parsed as octal.
  local year=$((10#${ymd:0:4}))
  local month=$((10#${ymd:4:2}))
  local day=$((10#${ymd:6:2}))
  local max_day=31

  ((month >= 1 && month <= 12 && day >= 1)) || return 1

  case "$month" in
    4 | 6 | 9 | 11)
      max_day=30
      ;;
    2)
      if (((year % 4 == 0 && year % 100 != 0) || year % 400 == 0)); then
        max_day=29
      else
        max_day=28
      fi
      ;;
  esac

  ((day <= max_day))
}

#######################################
# Resolve the API token: argument / env var first, then the token file.
# Whitespace (including a trailing CR/LF from the file) is stripped.
# Globals: HF_TOKEN (read/written), HF_TOKEN_FILE (read), HOME (read)
#######################################
resolve_token() {
  if [[ -z "$HF_TOKEN" ]]; then
    local token_file="${HF_TOKEN_FILE:-$HOME/.cache/huggingface/token}"
    if [[ -f "$token_file" ]]; then
      HF_TOKEN="$(<"$token_file")"
    fi
  fi

  HF_TOKEN="${HF_TOKEN//[[:space:]]/}"

  if [[ -z "$HF_TOKEN" ]]; then
    die "Error: HF_TOKEN is required. Provide as 3rd arg, set HF_TOKEN env var, or ensure ~/.cache/huggingface/token exists."
  fi
}

#######################################
# Abort with install instructions when huggingface_hub cannot be imported.
# Globals: PYTHON (read)
#######################################
require_hub_module() {
  if ! "${PYTHON[@]}" -c "import huggingface_hub" 2>/dev/null; then
    {
      echo "Error: 'huggingface_hub' module not found."
      echo ""
      echo "Install it with:"
      echo "  uv add huggingface_hub"
      echo ""
      echo "Or with CLI support:"
      echo "  uv add 'huggingface_hub[cli]'"
    } >&2
    exit 1
  fi
}

#######################################
# Print version / location diagnostics for huggingface_hub and related
# packages. Purely informational: a failure here only produces a warning.
# Package metadata comes from importlib.metadata, so neither pip nor
# `uv pip` has to be available.
# Globals: PYTHON (read)
#######################################
print_hub_info() {
  echo ""
  echo "[INFO] huggingface_hub version and installation information:"
  "${PYTHON[@]}" <<'PYEOF_INFO' || echo "[WARN] Could not collect huggingface_hub diagnostics." >&2
import os
import sys
from importlib import metadata

import huggingface_hub

module_version = getattr(huggingface_hub, "__version__", "N/A")
module_file = getattr(huggingface_hub, "__file__", None)
module_dir = os.path.dirname(module_file) if module_file else "N/A"
print(f"  - Module version: {module_version}")
print(f"  - Module location: {module_dir}")

# Installed distribution metadata (what `pip show` would report).
try:
    dist = metadata.distribution("huggingface-hub")
except Exception:
    dist = None

if dist is not None and dist.version:
    print(f"  - Package version: {dist.version}")
    try:
        print(f"  - Package location: {dist.locate_file('')}")
    except Exception:
        pass
    if module_version != "N/A" and module_version != dist.version:
        print(
            f"    (Note: Module version {module_version} may differ "
            f"from package version {dist.version})"
        )
else:
    print("  - Package version: Unable to determine")

print(f"  - Python executable: {sys.executable}")

# huggingface_hub[cli] is the same package plus the CLI extra dependencies.
try:
    import huggingface_hub.commands.huggingface_cli  # noqa: F401
    cli_available = "YES"
except Exception:
    cli_available = "NO"
print(f"  - CLI support (huggingface_hub[cli]): {cli_available}")
if cli_available == "NO":
    print(
        "    (To install CLI support: uv add 'huggingface_hub[cli]' "
        "or pip install 'huggingface_hub[cli]')"
    )

print()
print("[INFO] Related dependencies installation locations:")
print("  Checking module locations:")
for dep in ("huggingface_hub", "requests", "tqdm", "typer"):
    try:
        mod = __import__(dep)
    except ImportError:
        print(f"    - {dep}: NOT INSTALLED")
        continue
    location = getattr(mod, "__file__", None)
    if location:
        print(f"    - {dep}: {location}")
    else:
        print(f"    - {dep}: built-in or namespace package")
PYEOF_INFO
  echo ""
}

#######################################
# List the repo files, select the backups dated on or before DELETE_DATE and
# print the preview. Writes three files for the later steps:
#   MATCHED_FILE  - JSON list of the matching paths
#   PATTERNS_FILE - JSON list of wildcard patterns (one per path prefix + date)
#   COUNT_FILE    - number of matching paths
# All inputs are handed to Python through the environment and the heredoc is
# quoted, so no shell value is ever interpreted as Python source.
# Globals: PYTHON, HF_TOKEN, DATASET_REPO, REPO_TYPE, DELETE_DATE,
#          MATCHED_FILE, PATTERNS_FILE, COUNT_FILE (all read)
# Returns: non-zero if the file list could not be fetched
#######################################
list_matching_files() {
  HF_TOKEN="$HF_TOKEN" \
    OC_REPO_ID="$DATASET_REPO" \
    OC_REPO_TYPE="$REPO_TYPE" \
    OC_DELETE_DATE="$DELETE_DATE" \
    OC_MATCHED_FILE="$MATCHED_FILE" \
    OC_PATTERNS_FILE="$PATTERNS_FILE" \
    OC_COUNT_FILE="$COUNT_FILE" \
    "${PYTHON[@]}" <<'PYEOF'
import glob
import json
import os
import sys

from huggingface_hub import HfApi

MARKER = "openclaw-backup-"
DATE_LEN = 8  # YYYYMMDD

token = os.environ["HF_TOKEN"]
repo_id = os.environ["OC_REPO_ID"]
repo_type = os.environ["OC_REPO_TYPE"]
# int() instead of a pasted literal: "09990101" is not a valid Python literal.
max_date = int(os.environ["OC_DELETE_DATE"])

api = HfApi(token=token)

try:
    files = api.list_repo_files(repo_id=repo_id, repo_type=repo_type)
except Exception as e:
    err = str(e)
    print(f"[ERROR] Failed to list files: {e}", file=sys.stderr)
    print(file=sys.stderr)
    if "404" in err:
        print(f"[HINT] Dataset '{repo_id}' not found.", file=sys.stderr)
        print(f"[HINT] Make sure you specified the Dataset repo, not the Space repo.", file=sys.stderr)
        print(f"[HINT] Backup dataset is typically named like '-backup'.", file=sys.stderr)
        print(f"[HINT] Check your OPENCLAW_BACKUP_DATASET_REPO env var.", file=sys.stderr)
    sys.exit(1)

matched = []
patterns = set()
for path in files:
    # Path format: backups/openclaw-backup-YYYYMMDD-HHMMSS.tar.gz.enc
    marker_at = path.find(MARKER)
    if marker_at < 0:
        continue
    date_start = marker_at + len(MARKER)
    date_part = path[date_start:date_start + DATE_LEN]
    # Require exactly 8 ASCII digits; a shorter numeric fragment such as
    # "2026" must not be compared as if it were a date.
    if len(date_part) != DATE_LEN or not (date_part.isascii() and date_part.isdigit()):
        continue
    if int(date_part) <= max_date:
        matched.append(path)
        # Build the wildcard from the real path prefix so that batch mode
        # deletes exactly what the preview lists, wherever the file lives.
        patterns.add(glob.escape(path[:date_start + DATE_LEN]) + "*")

print(f"Total files: {len(files)}, matched: {len(matched)}")

# One wildcard per date, e.g. backups/openclaw-backup-YYYYMMDD*
pattern_list = sorted(patterns)
print(f"Delete patterns ({len(pattern_list)} date patterns):")
for pattern in pattern_list:
    print(f"  - {pattern}")

with open(os.environ["OC_MATCHED_FILE"], "w", encoding="utf-8") as fh:
    json.dump(matched, fh)
with open(os.environ["OC_PATTERNS_FILE"], "w", encoding="utf-8") as fh:
    json.dump(pattern_list, fh)
with open(os.environ["OC_COUNT_FILE"], "w", encoding="utf-8") as fh:
    fh.write(str(len(matched)))

if matched:
    print()
    print("Matched files:")
    for path in sorted(matched):
        print(f"  - {path}")
PYEOF
}

#######################################
# Batch mode: delete by wildcard pattern, up to 10 date patterns per commit.
# Stops at the first failing batch.
# Globals: PYTHON, HF_TOKEN, DATASET_REPO, REPO_TYPE, DELETE_DATE,
#          PATTERNS_FILE (all read)
# Returns: non-zero if a batch failed
#######################################
delete_in_batches() {
  echo "Mode: BATCH DELETE (using delete_files API)"
  echo ""
  HF_TOKEN="$HF_TOKEN" \
    OC_REPO_ID="$DATASET_REPO" \
    OC_REPO_TYPE="$REPO_TYPE" \
    OC_DELETE_DATE="$DELETE_DATE" \
    OC_PATTERNS_FILE="$PATTERNS_FILE" \
    "${PYTHON[@]}" <<'PYEOF_BATCH'
import json
import os
import sys

from huggingface_hub import HfApi

token = os.environ["HF_TOKEN"]
repo_id = os.environ["OC_REPO_ID"]
repo_type = os.environ["OC_REPO_TYPE"]
delete_date = os.environ["OC_DELETE_DATE"]

with open(os.environ["OC_PATTERNS_FILE"], "r", encoding="utf-8") as fh:
    patterns_to_delete = json.load(fh)

api = HfApi(token=token)

# Each pattern matches every backup file of one date,
# e.g. "backups/openclaw-backup-20260101*".
BATCH_SIZE = 10  # at most 10 date patterns per commit
total = len(patterns_to_delete)
total_batches = (total + BATCH_SIZE - 1) // BATCH_SIZE
deleted_total = 0

for batch_idx in range(total_batches):
    start = batch_idx * BATCH_SIZE
    batch = patterns_to_delete[start:start + BATCH_SIZE]
    try:
        api.delete_files(
            repo_id=repo_id,
            repo_type=repo_type,
            delete_patterns=batch,
            commit_message=f"backup cleanup: batch {batch_idx + 1}/{total_batches} ({len(batch)} date patterns, date {delete_date})",
        )
        deleted_total += len(batch)
        print(f"  Batch {batch_idx + 1}/{total_batches}: deleted files matching {len(batch)} date pattern(s)")
    except Exception as e:
        print(f"  Batch {batch_idx + 1}/{total_batches}: FAILED - {e}")
        print(f"\nResult: deleted {deleted_total}/{total} date pattern(s) before failure")
        sys.exit(1)

print(f"\nResult: deleted all files matching {total} date pattern(s) in {total_batches} batch(es)")
PYEOF_BATCH
}

#######################################
# Individual mode: one commit per file. A 404 counts as already deleted; a
# 429 is retried with increasing waits (30s, 60s) up to 3 attempts.
# Globals: PYTHON, HF_TOKEN, DATASET_REPO, REPO_TYPE, DELETE_DATE,
#          MATCHED_FILE (all read)
# Returns: non-zero if at least one file could not be deleted
#######################################
delete_individually() {
  echo "Mode: INDIVIDUAL DELETE (one by one with retry)"
  echo ""
  HF_TOKEN="$HF_TOKEN" \
    OC_REPO_ID="$DATASET_REPO" \
    OC_REPO_TYPE="$REPO_TYPE" \
    OC_DELETE_DATE="$DELETE_DATE" \
    OC_MATCHED_FILE="$MATCHED_FILE" \
    "${PYTHON[@]}" <<'PYEOF_INDIVIDUAL'
import json
import os
import sys
import time

from huggingface_hub import HfApi

token = os.environ["HF_TOKEN"]
repo_id = os.environ["OC_REPO_ID"]
repo_type = os.environ["OC_REPO_TYPE"]
delete_date = os.environ["OC_DELETE_DATE"]

with open(os.environ["OC_MATCHED_FILE"], "r", encoding="utf-8") as fh:
    files_to_delete = json.load(fh)

api = HfApi(token=token)

total = len(files_to_delete)
deleted = 0
failed = 0
rate_limit_hits = 0
max_retries = 3

for idx, path in enumerate(sorted(files_to_delete), 1):
    retries = 0
    while retries < max_retries:
        try:
            api.delete_file(
                repo_id=repo_id,
                repo_type=repo_type,
                path_in_repo=path,
                commit_message=f"backup cleanup: delete {path} (date {delete_date})",
            )
            deleted += 1
            print(f"  [{idx}/{total}] Deleted: {path}")
            break
        except Exception as e:
            err = str(e)
            if "404" in err or "Entry Not Found" in err:
                # Already gone: the goal (file absent) is met.
                print(f"  [{idx}/{total}] Skipped (not found): {path}")
                deleted += 1
                break
            if "429" in err or "Too Many Requests" in err:
                retries += 1
                rate_limit_hits += 1
                if retries >= max_retries:
                    print(f"  [{idx}/{total}] Failed (rate limit): {path}")
                    failed += 1
                    break
                wait = min(2 ** retries * 15, 1800)
                print(f"  [{idx}/{total}] Rate limited, waiting {wait}s...")
                time.sleep(wait)
                continue
            print(f"  [{idx}/{total}] Failed: {path} - {err}")
            failed += 1
            break

print(f"\nResult: deleted {deleted}, failed {failed}, rate limit hits {rate_limit_hits}")

# Report partial failure to the caller instead of pretending success.
sys.exit(1 if failed else 0)
PYEOF_INDIVIDUAL
}

#######################################
# Entry point.
# Arguments: $1 - dataset repo ID   (or DATASET_REPO / OPENCLAW_BACKUP_DATASET_REPO)
#            $2 - cut-off date      (or DELETE_DATE)
#            $3 - API token         (or HF_TOKEN / token file)
#######################################
main() {
  detect_python

  # ---- Argument parsing: positional arguments win over the environment ----
  DATASET_REPO="${1:-${DATASET_REPO:-${OPENCLAW_BACKUP_DATASET_REPO:-}}}"
  DELETE_DATE="${2:-${DELETE_DATE:-}}"
  HF_TOKEN="${3:-${HF_TOKEN:-}}"
  DRY_RUN="${DRY_RUN:-0}"
  REPO_TYPE="${REPO_TYPE:-dataset}"
  BATCH_DELETE="${BATCH_DELETE:-0}"

  # ---- Pre-flight checks ----
  if [[ -z "$DATASET_REPO" || -z "$DELETE_DATE" ]]; then
    usage
    exit 1
  fi

  if ! [[ "$DELETE_DATE" =~ ^[0-9]{8}$ ]]; then
    die "Error: 日期格式错误 '$DELETE_DATE',应为 YYYYMMDD (如 20260427)"
  fi

  if ! is_valid_date "$DELETE_DATE"; then
    die "Error: 无效的日期 '$DELETE_DATE'"
  fi

  resolve_token

  local dry_run_label="NO"
  if [[ "$DRY_RUN" == "1" ]]; then
    dry_run_label="YES (preview only)"
  fi
  local mode_label="INDIVIDUAL (delete_file per file)"
  if [[ "$BATCH_DELETE" == "1" ]]; then
    mode_label="BATCH (delete_files API)"
  fi

  echo "============================================"
  echo "OpenClaw HF Dataset Cleanup Script"
  echo "============================================"
  echo "Dataset:     $DATASET_REPO"
  echo "Delete From: ${DELETE_DATE:0:4}-${DELETE_DATE:4:2}-${DELETE_DATE:6:2}"
  echo "DRY_RUN:     $dry_run_label"
  echo "Delete Mode: $mode_label"
  echo ""

  # ---- Scratch files: one private directory, removed on every exit path ----
  trap cleanup EXIT
  WORK_DIR="$(mktemp -d)" || die "Error: mktemp failed"
  MATCHED_FILE="$WORK_DIR/matched.json"
  PATTERNS_FILE="$WORK_DIR/patterns.json"
  COUNT_FILE="$WORK_DIR/matched.count"

  # ---- Step 1: list and filter files ----
  echo "[1/3] Fetching file list from Dataset..."
  require_hub_module
  print_hub_info
  list_matching_files || exit 1

  local matched_count
  matched_count="$(<"$COUNT_FILE")"
  if [[ "$matched_count" -eq 0 ]]; then
    echo "No matching backup files found. Nothing to clean up."
    exit 0
  fi

  # ---- Step 2: confirm / DRY_RUN ----
  echo ""
  echo "[2/3] Ready to delete $matched_count file(s)..."
  if [[ "$DRY_RUN" == "1" ]]; then
    echo "DRY_RUN mode enabled. Set DRY_RUN=0 to actually delete."
    exit 0
  fi

  # ---- Step 3: delete ----
  echo "[3/3] Deleting files..."
  if [[ "$BATCH_DELETE" == "1" ]]; then
    delete_in_batches || die "Error: batch deletion failed."
  else
    delete_individually || die "Error: some files could not be deleted."
  fi

  echo ""
  echo "============================================"
  echo "Done! Cleanup finished."
  echo "============================================"
}

main "$@"