| #!/usr/bin/env bash |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
set -euo pipefail

# Resolve the directory holding this script and the repository root,
# independent of the caller's working directory.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "$SCRIPT_DIR/.." && pwd)"

# Default to the system interpreter; upgrade to uv when this looks like a
# uv-managed project. $PYTHON is deliberately a multi-word string that
# relies on word-splitting at call sites.
PYTHON="python3"
if [[ -f "$REPO_ROOT/pyproject.toml" ]] && command -v uv &>/dev/null; then
  PYTHON="uv run python3"
  echo "[UV] Detected uv project, using: $PYTHON"
fi
|
|
| |
# Inputs: positional args take precedence over environment variables;
# OPENCLAW_BACKUP_DATASET_REPO is the project-wide fallback for the repo id.
DATASET_REPO="${1:-${DATASET_REPO:-${OPENCLAW_BACKUP_DATASET_REPO:-}}}"
DELETE_DATE="${2:-${DELETE_DATE:-}}"
HF_TOKEN="${3:-${HF_TOKEN:-}}"

# Repo and cutoff date are mandatory. Usage is a diagnostic, so it goes to
# stderr (the original sent it to stdout while still exiting non-zero).
if [[ -z "$DATASET_REPO" || -z "$DELETE_DATE" ]]; then
  {
    echo "Usage: $0 <DATASET_REPO> <DATE> [HF_TOKEN]"
    echo ""
    echo "Examples:"
    echo " $0 GGSheng/page-backup 20260427 hf_xxxxx"
    echo " DATASET_REPO=GGSheng/page-backup DELETE_DATE=20260427 HF_TOKEN=hf_xxxxx $0"
    echo " $0 GGSheng/page-backup 20260427"
    echo ""
    echo "Environment variables:"
    echo " DATASET_REPO - Dataset 仓库 ID (如 GGSheng/page-backup)"
    echo " DELETE_DATE - 要清理的日期 (YYYYMMDD)"
    echo " HF_TOKEN - Hugging Face API Token"
    echo " HF_TOKEN_FILE - Token 文件路径 (默认 ~/.cache/huggingface/token)"
    echo " DRY_RUN - 设为 1 仅预览 (默认 0)"
    echo " BATCH_DELETE - 设为 1 使用批量删除 (默认 0)"
  } >&2
  exit 1
fi
|
|
# The cutoff must be exactly eight digits (YYYYMMDD).
if ! [[ "$DELETE_DATE" =~ ^[0-9]{8}$ ]]; then
  echo "Error: 日期格式错误 '$DELETE_DATE',应为 YYYYMMDD (如 20260427)"
  exit 1
fi

# Verify the string is a real calendar date. Done in pure bash because
# 'date -d <string>' is a GNU extension: on BSD/macOS '-d' means something
# else entirely, so the original check rejected every date there.
is_valid_yyyymmdd() {
  # 10# forces base-10 so leading zeros (e.g. month "08") don't parse as octal.
  local y=$((10#${1:0:4})) m=$((10#${1:4:2})) d=$((10#${1:6:2}))
  local month_days=(31 28 31 30 31 30 31 31 30 31 30 31)
  (( m >= 1 && m <= 12 && d >= 1 )) || return 1
  if (( m == 2 )) && (( (y % 4 == 0 && y % 100 != 0) || y % 400 == 0 )); then
    (( d <= 29 ))   # leap-year February
  else
    (( d <= ${month_days[m - 1]} ))
  fi
}

if ! is_valid_yyyymmdd "$DELETE_DATE"; then
  echo "Error: 无效的日期 '$DELETE_DATE'"
  exit 1
fi
|
|
| |
# If no token was supplied, fall back to the cached Hugging Face CLI token.
if [[ -z "$HF_TOKEN" ]]; then
  HF_TOKEN_FILE="${HF_TOKEN_FILE:-$HOME/.cache/huggingface/token}"
  if [[ -f "$HF_TOKEN_FILE" ]]; then
    # $(<file) reads the file without forking cat; trailing newlines are
    # stripped by the command substitution either way.
    HF_TOKEN="$(<"$HF_TOKEN_FILE")"
  fi
fi

# Every HF API call below needs authentication; bail out without a token.
if [[ -z "$HF_TOKEN" ]]; then
  echo "Error: HF_TOKEN is required. Provide as 3rd arg, set HF_TOKEN env var, or ensure ~/.cache/huggingface/token exists."
  exit 1
fi
|
|
# Runtime knobs; each may be preset in the environment.
DRY_RUN="${DRY_RUN:-0}"
REPO_TYPE="${REPO_TYPE:-dataset}"
BATCH_DELETE="${BATCH_DELETE:-0}"

# Build human-readable labels for the banner up front.
if [[ "$DRY_RUN" = "1" ]]; then
  dry_run_label='YES (preview only)'
else
  dry_run_label='NO'
fi
if [[ "$BATCH_DELETE" = "1" ]]; then
  delete_mode_label='BATCH (delete_files API)'
else
  delete_mode_label='INDIVIDUAL (delete_file per file)'
fi

echo "============================================"
echo "OpenClaw HF Dataset Cleanup Script"
echo "============================================"
echo "Dataset: $DATASET_REPO"
echo "Delete From: ${DELETE_DATE:0:4}-${DELETE_DATE:4:2}-${DELETE_DATE:6:2}"
echo "DRY_RUN: $dry_run_label"
echo "Delete Mode: $delete_mode_label"
echo ""
|
|
| |
# Temp files registered in CLEANUP_FILES are removed on every exit path.
CLEANUP_FILES=()
cleanup() {
  local f
  # ${arr[@]+...} expands to nothing when the array is empty, avoiding the
  # "unbound variable" error that "${CLEANUP_FILES[@]}" triggers under
  # 'set -u' on bash < 4.4.
  for f in ${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}; do
    rm -f -- "$f"   # '--' guards against paths that start with '-'
  done
}
trap cleanup EXIT
|
|
| |
| echo "[1/3] Fetching file list from Dataset..." |
|
|
| ALL_FILES_FILE="$(mktemp)" |
| MATCHED_FILES_FILE="$(mktemp)" |
| CLEANUP_FILES+=("$ALL_FILES_FILE" "$MATCHED_FILES_FILE") |
|
|
| |
# Fail fast with install hints if huggingface_hub is not importable.
if ! $PYTHON -c "import huggingface_hub" 2>/dev/null; then
  cat <<'HINTS'
Error: 'huggingface_hub' module not found.

Install it with:
 uv add huggingface_hub

Or with CLI support:
 uv add 'huggingface_hub[cli]'
HINTS
  exit 1
fi
|
|
# List every file in the repo and record those whose name embeds an
# 'openclaw-backup-YYYYMMDD' date at or before the cutoff.
#
# All parameters travel via the environment and the heredoc delimiter is
# quoted ('PYEOF'), so a token, repo id, or temp path containing quotes can
# neither break nor inject into the generated Python source — the original
# spliced $HF_TOKEN etc. directly into triple-quoted string literals.
export HF_TOKEN DATASET_REPO REPO_TYPE DELETE_DATE ALL_FILES_FILE MATCHED_FILES_FILE
$PYTHON << 'PYEOF'
import json, os, sys
from huggingface_hub import HfApi

token = os.environ["HF_TOKEN"]
repo_id = os.environ["DATASET_REPO"]
repo_type = os.environ["REPO_TYPE"]
max_date = int(os.environ["DELETE_DATE"])

api = HfApi(token=token)
try:
    files = api.list_repo_files(repo_id=repo_id, repo_type=repo_type)
except Exception as e:
    err = str(e)
    print(f"[ERROR] Failed to list files: {e}", file=sys.stderr)
    print(file=sys.stderr)
    if "404" in err:
        print(f"[HINT] Dataset '{repo_id}' not found.", file=sys.stderr)
        print(f"[HINT] Make sure you specified the Dataset repo, not the Space repo.", file=sys.stderr)
        print(f"[HINT] Backup dataset is typically named like '<space-name>-backup'.", file=sys.stderr)
        print(f"[HINT] Check your OPENCLAW_BACKUP_DATASET_REPO env var.", file=sys.stderr)
    sys.exit(1)

# A file matches when the 8 characters after 'openclaw-backup-' are a date
# numerically at or before the cutoff.
matched = []
for f in files:
    parts = f.split("openclaw-backup-")
    if len(parts) >= 2:
        date_part = parts[1][:8]
        if date_part.isdigit() and int(date_part) <= max_date:
            matched.append(f)

print(f"Total files: {len(files)}, matched: {len(matched)}")

with open(os.environ["ALL_FILES_FILE"], 'w') as fh:
    json.dump(files, fh)
with open(os.environ["MATCHED_FILES_FILE"], 'w') as fh:
    json.dump(matched, fh)
PYEOF
|
|
# Count matches. The temp-file path is passed as argv rather than being
# interpolated into the Python source, so quoting characters in the path
# cannot break the inline snippet (the original embedded it in open('...')).
MATCHED_COUNT=$($PYTHON -c 'import json, sys; print(len(json.load(open(sys.argv[1]))))' "$MATCHED_FILES_FILE")

if [[ "$MATCHED_COUNT" -eq 0 ]]; then
  echo "No matching backup files found. Nothing to clean up."
  exit 0
fi

echo ""
echo "Matched files:"
# Same argv technique for the listing snippet.
$PYTHON -c "
import json, sys
files = json.load(open(sys.argv[1]))
for f in sorted(files):
    print(f' - {f}')
" "$MATCHED_FILES_FILE"
|
|
| |
| echo "" |
| echo "[2/3] Ready to delete $MATCHED_COUNT file(s)..." |
|
|
| if [[ "$DRY_RUN" = "1" ]]; then |
| echo "DRY_RUN mode enabled. Set DRY_RUN=0 to actually delete." |
| exit 0 |
| fi |
|
|
| |
| echo "[3/3] Deleting files..." |
|
|
| BATCH_DELETE="${BATCH_DELETE:-0}" |
|
|
| if [[ "$BATCH_DELETE" = "1" ]]; then |
| echo "Mode: BATCH DELETE (using delete_files API)" |
| echo "" |
|
|
| $PYTHON << PYEOF_BATCH |
| import json, sys |
| from huggingface_hub import HfApi |
| |
| token = """${HF_TOKEN}""" |
| repo_id = """${DATASET_REPO}""" |
| repo_type = """${REPO_TYPE}""" |
| delete_date = """${DELETE_DATE}""" |
| |
| with open("${MATCHED_FILES_FILE//\\///}", 'r') as fh: |
| files_to_delete = json.load(fh) |
| |
| api = HfApi(token=token) |
| |
| # 分批删除,每批最多 100 个文件(避免单次 commit 过大) |
| BATCH_SIZE = 100 |
| total = len(files_to_delete) |
| total_batches = (total + BATCH_SIZE - 1) // BATCH_SIZE |
| deleted_total = 0 |
| |
| for batch_idx in range(total_batches): |
| start = batch_idx * BATCH_SIZE |
| end = min(start + BATCH_SIZE, total) |
| batch = files_to_delete[start:end] |
| |
| try: |
| api.delete_files( |
| repo_id=repo_id, |
| repo_type=repo_type, |
| paths=batch, |
| commit_message=f"backup cleanup: batch {batch_idx + 1}/{total_batches} ({len(batch)} files, date {delete_date})" |
| ) |
| deleted_total += len(batch) |
| print(f" Batch {batch_idx + 1}/{total_batches}: deleted {len(batch)} files") |
| except Exception as e: |
| print(f" Batch {batch_idx + 1}/{total_batches}: FAILED - {e}") |
| print(f"\nResult: deleted {deleted_total}/{total} files before failure") |
| sys.exit(1) |
| |
| print(f"\nResult: deleted all {total} files in {total_batches} batch(es)") |
| PYEOF_BATCH |
|
|
| else |
| echo "Mode: INDIVIDUAL DELETE (one by one with retry)" |
| echo "" |
|
|
| $PYTHON << PYEOF_INDIVIDUAL |
| import json, sys, time |
| |
| token = """${HF_TOKEN}""" |
| repo_id = """${DATASET_REPO}""" |
| repo_type = """${REPO_TYPE}""" |
| delete_date = """${DELETE_DATE}""" |
| |
| with open("${MATCHED_FILES_FILE//\\///}", 'r') as fh: |
| files_to_delete = json.load(fh) |
| |
| from huggingface_hub import HfApi |
| api = HfApi(token=token) |
| |
| total = len(files_to_delete) |
| deleted = 0 |
| failed = 0 |
| rate_limit_hits = 0 |
| max_retries = 3 |
| |
| for idx, path in enumerate(sorted(files_to_delete), 1): |
| retries = 0 |
| while retries < max_retries: |
| try: |
| api.delete_file( |
| repo_id=repo_id, |
| repo_type=repo_type, |
| path_in_repo=path, |
| commit_message=f"backup cleanup: delete {path} (date {delete_date})" |
| ) |
| deleted += 1 |
| print(f" [{idx}/{total}] Deleted: {path}") |
| break |
| except Exception as e: |
| err = str(e) |
| if "404" in err or "Entry Not Found" in err: |
| print(f" [{idx}/{total}] Skipped (not found): {path}") |
| deleted += 1 |
| break |
| elif "429" in err or "Too Many Requests" in err: |
| retries += 1 |
| rate_limit_hits += 1 |
| if retries >= max_retries: |
| print(f" [{idx}/{total}] Failed (rate limit): {path}") |
| failed += 1 |
| break |
| wait = min(2 ** retries * 15, 1800) |
| print(f" [{idx}/{total}] Rate limited, waiting {wait}s...") |
| time.sleep(wait) |
| continue |
| else: |
| print(f" [{idx}/{total}] Failed: {path} - {err}") |
| failed += 1 |
| break |
| |
| print(f"\nResult: deleted {deleted}, failed {failed}, rate limit hits {rate_limit_hits}") |
| PYEOF_INDIVIDUAL |
| fi |
|
|
| echo "" |
| echo "============================================" |
| echo "Done! Cleanup finished." |
| echo "============================================" |
|
|