#!/usr/bin/env bash
# ============================================================
# OpenClaw HF Dataset 备份清理脚本
#
# 根据指定日期删除 Hugging Face Dataset 中所有匹配的备份文件。
# 如日期为 20260427,则删除文件名中包含 openclaw-backup-20260427、
# openclaw-backup-20260426、openclaw-backup-20260425 … 及更早日期的备份文件。
#
# 用法:
# ./delete-backups.sh <DATASET_REPO> <DATE> [HF_TOKEN]
#
# 示例:
# ./scripts/delete-backups.sh GGSheng/page-backup 20260427 hf_xxxxx
#
# 方式1:直接提供参数
# ./scripts/delete-backups.sh GGSheng/page-backup 20260427 hf_xxxxx
#
# 方式2:设置环境变量
# export DATASET_REPO="GGSheng/page-backup"
# export DELETE_DATE="20260427"
# export HF_TOKEN="hf_xxxxx"
# ./scripts/delete-backups.sh
#
# 方式3:使用缓存的 token(默认从 ~/.cache/huggingface/token 读取)
# ./scripts/delete-backups.sh GGSheng/page-backup 20260427
#
# 环境变量 (可选):
# DATASET_REPO - Dataset 仓库 ID (如 GGSheng/page-backup)
# DELETE_DATE - 要清理的日期 (YYYYMMDD 格式)
# HF_TOKEN - Hugging Face API Token
# HF_TOKEN_FILE - Token 文件路径 (默认 ~/.cache/huggingface/token)
# DRY_RUN - 设为 1 时仅列出匹配文件而不删除 (默认 0)
# BATCH_DELETE - 设为 1 时使用批量删除 API (默认 0,逐条删除)
#
# 删除模式:
# BATCH_DELETE=0 (默认): 逐条删除,每条独立 commit,有重试/限速处理,容错性好
# BATCH_DELETE=1: 批量删除,每批最多 100 个文件,单次 commit 含多个文件,效率更高
#
# 注意事项:
# 1. 请确保 HF_TOKEN 有对该 Dataset 的 write 权限
# 2. 删除操作不可逆,建议先用 DRY_RUN=1 预览匹配的文件
# 3. 日期格式固定为 YYYYMMDD(如 20260427)
# 4. 批量删除模式使用 delete_files() API,单次 commit 含多个文件,失败会全部回滚
#
############################################################
# 与 push-to-space.sh 同样的参数模式
# ./scripts/delete-backups.sh GGSheng/page-backup 20260427 hf_xxxxx
# 使用缓存的 token
# ./scripts/delete-backups.sh GGSheng/page-backup 20260427
# 使用环境变量
# DATASET_REPO=GGSheng/page-backup DELETE_DATE=20260427 ./scripts/delete-backups.sh
# 仅预览
# DRY_RUN=1 ./scripts/delete-backups.sh GGSheng/page-backup 20260430
# 删除
# DRY_RUN=0 ./scripts/delete-backups.sh GGSheng/page-backup 20260430
# ============================================================
set -euo pipefail

# Resolve the script's own directory and the repository root (its parent).
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "$SCRIPT_DIR/.." && pwd)"

# ---- Pick the Python command: plain python3, or `uv run python3` when
# this looks like a uv-managed project (uv on PATH + pyproject.toml). ----
PYTHON="python3"
if command -v uv >/dev/null 2>&1 && [[ -f "$REPO_ROOT/pyproject.toml" ]]; then
  PYTHON="uv run python3"
  echo "[UV] Detected uv project, using: $PYTHON"
fi
# ---- Argument parsing ----
# Positional arguments win over environment variables; DATASET_REPO also
# falls back to OPENCLAW_BACKUP_DATASET_REPO.
DATASET_REPO="${1:-${DATASET_REPO:-${OPENCLAW_BACKUP_DATASET_REPO:-}}}"
DELETE_DATE="${2:-${DELETE_DATE:-}}"
HF_TOKEN="${3:-${HF_TOKEN:-}}"

# ---- Preflight checks ----
if [[ -z "$DATASET_REPO" || -z "$DELETE_DATE" ]]; then
  cat <<USAGE
Usage: $0 <DATASET_REPO> <DATE> [HF_TOKEN]

Examples:
 $0 GGSheng/page-backup 20260427 hf_xxxxx
 DATASET_REPO=GGSheng/page-backup DELETE_DATE=20260427 HF_TOKEN=hf_xxxxx $0
 $0 GGSheng/page-backup 20260427

Environment variables:
 DATASET_REPO - Dataset 仓库 ID (如 GGSheng/page-backup)
 DELETE_DATE - 要清理的日期 (YYYYMMDD)
 HF_TOKEN - Hugging Face API Token
 HF_TOKEN_FILE - Token 文件路径 (默认 ~/.cache/huggingface/token)
 DRY_RUN - 设为 1 仅预览 (默认 0)
 BATCH_DELETE - 设为 1 使用批量删除 (默认 0)
USAGE
  exit 1
fi

# Reject anything that is not exactly eight digits before going further.
if [[ ! "$DELETE_DATE" =~ ^[0-9]{8}$ ]]; then
  echo "Error: 日期格式错误 '$DELETE_DATE',应为 YYYYMMDD (如 20260427)"
  exit 1
fi
# ---- Validate that DELETE_DATE is a real calendar date ----
# Return 0 if $1 (YYYY-MM-DD) is a valid date. Try GNU date (-d) first and
# fall back to the BSD/macOS spelling (-j -f), so the check works on both
# platforms. (The original used GNU-only `date -d` and a redundant
# `&>/dev/null 2>&1` double redirection.)
is_valid_iso_date() {
  date -d "$1" >/dev/null 2>&1 || date -j -f '%Y-%m-%d' "$1" >/dev/null 2>&1
}
# DELETE_DATE is guaranteed non-empty by the earlier checks; the
# "${DELETE_DATE:-}" guard only keeps this block self-contained.
if [[ -n "${DELETE_DATE:-}" ]] \
  && ! is_valid_iso_date "${DELETE_DATE:0:4}-${DELETE_DATE:4:2}-${DELETE_DATE:6:2}"; then
  echo "Error: 无效的日期 '$DELETE_DATE'"
  exit 1
fi
# ---- Resolve the HF API token ----
# Precedence: 3rd positional arg / HF_TOKEN env (already in HF_TOKEN),
# then the cached token file (typically written by `huggingface-cli login`).
if [[ -z "$HF_TOKEN" ]]; then
  HF_TOKEN_FILE="${HF_TOKEN_FILE:-$HOME/.cache/huggingface/token}"
  [[ -f "$HF_TOKEN_FILE" ]] && HF_TOKEN="$(< "$HF_TOKEN_FILE")"
fi
if [[ -z "$HF_TOKEN" ]]; then
  echo "Error: HF_TOKEN is required. Provide as 3rd arg, set HF_TOKEN env var, or ensure ~/.cache/huggingface/token exists."
  exit 1
fi

# Optional behavior flags; every default is the conservative choice.
DRY_RUN="${DRY_RUN:-0}"
REPO_TYPE="${REPO_TYPE:-dataset}"
BATCH_DELETE="${BATCH_DELETE:-0}"
# ---- Run banner: show the effective configuration before doing work ----
if [[ "$DRY_RUN" = "1" ]]; then
  DRY_RUN_LABEL='YES (preview only)'
else
  DRY_RUN_LABEL='NO'
fi
if [[ "$BATCH_DELETE" = "1" ]]; then
  DELETE_MODE_LABEL='BATCH (delete_files API)'
else
  DELETE_MODE_LABEL='INDIVIDUAL (delete_file per file)'
fi
echo "============================================"
echo "OpenClaw HF Dataset Cleanup Script"
echo "============================================"
echo "Dataset: $DATASET_REPO"
echo "Delete From: ${DELETE_DATE:0:4}-${DELETE_DATE:4:2}-${DELETE_DATE:6:2}"
echo "DRY_RUN: $DRY_RUN_LABEL"
echo "Delete Mode: $DELETE_MODE_LABEL"
echo ""
# ---- Temp-file cleanup on exit ----
# Paths appended to CLEANUP_FILES are removed on every exit path
# (success, error, or set -e abort) via the EXIT trap.
CLEANUP_FILES=()
cleanup() {
  # Guard the expansion: "${arr[@]}" on an empty array trips `set -u`
  # in bash < 4.4, and the trap can fire before any file is registered
  # (e.g. if mktemp fails).
  [[ ${#CLEANUP_FILES[@]} -eq 0 ]] && return 0
  local f
  for f in "${CLEANUP_FILES[@]}"; do
    rm -f -- "$f"
  done
}
trap cleanup EXIT
# ---- Step 1: fetch and filter the remote file list ----
echo "[1/3] Fetching file list from Dataset..."
ALL_FILES_FILE="$(mktemp)"
MATCHED_FILES_FILE="$(mktemp)"
CLEANUP_FILES+=("$ALL_FILES_FILE" "$MATCHED_FILES_FILE")

# Fail fast with install instructions when huggingface_hub is missing.
if ! $PYTHON -c "import huggingface_hub" 2>/dev/null; then
  cat <<'HFHUB_MISSING'
Error: 'huggingface_hub' module not found.

Install it with:
 uv add huggingface_hub

Or with CLI support:
 uv add 'huggingface_hub[cli]'
HFHUB_MISSING
  exit 1
fi
# List every file in the Dataset and record those whose embedded
# openclaw-backup-YYYYMMDD date is on or before DELETE_DATE.
$PYTHON << PYEOF
import json, sys
from huggingface_hub import HfApi

# NOTE: shell variables are interpolated into this heredoc. The values are
# pre-validated (DELETE_DATE is exactly 8 digits) or HF-issued (token),
# so this is safe here — do not reuse the pattern for arbitrary input.
token = """${HF_TOKEN}"""
repo_id = """${DATASET_REPO}"""
repo_type = """${REPO_TYPE}"""
# int("...") instead of a bare interpolated literal: a date with a leading
# zero (allowed by the ^[0-9]{8}$ check) is an invalid Python 3 integer
# literal and would raise SyntaxError.
max_date = int("${DELETE_DATE}")

api = HfApi(token=token)
try:
    files = api.list_repo_files(repo_id=repo_id, repo_type=repo_type)
except Exception as e:
    err = str(e)
    print(f"[ERROR] Failed to list files: {e}", file=sys.stderr)
    print(file=sys.stderr)
    if "404" in err:
        print(f"[HINT] Dataset '{repo_id}' not found.", file=sys.stderr)
        print("[HINT] Make sure you specified the Dataset repo, not the Space repo.", file=sys.stderr)
        print("[HINT] Backup dataset is typically named like '<space-name>-backup'.", file=sys.stderr)
        print("[HINT] Check your OPENCLAW_BACKUP_DATASET_REPO env var.", file=sys.stderr)
    sys.exit(1)

# Keep files that contain "openclaw-backup-" followed by a date <= max_date.
matched = []
for f in files:
    parts = f.split("openclaw-backup-")
    if len(parts) >= 2:
        date_part = parts[1][:8]
        if date_part.isdigit() and int(date_part) <= max_date:
            matched.append(f)

print(f"Total files: {len(files)}, matched: {len(matched)}")
with open("${ALL_FILES_FILE//\\///}", 'w') as fh:
    json.dump(files, fh)
with open("${MATCHED_FILES_FILE//\\///}", 'w') as fh:
    json.dump(matched, fh)
PYEOF
# Count the matched files recorded by step 1; nothing to do if zero.
MATCHED_COUNT="$($PYTHON -c "import json; print(len(json.load(open('${MATCHED_FILES_FILE//\\///}'))))")"
if (( MATCHED_COUNT == 0 )); then
  echo "No matching backup files found. Nothing to clean up."
  exit 0
fi

echo ""
echo "Matched files:"
$PYTHON << PYEOF_LIST
import json

with open("${MATCHED_FILES_FILE//\\///}") as fh:
    for path in sorted(json.load(fh)):
        print(f" - {path}")
PYEOF_LIST
# ---- Step 2: dry-run gate ----
printf '\n[2/3] Ready to delete %s file(s)...\n' "$MATCHED_COUNT"
# In preview mode, stop here with success after listing the matches.
[[ "$DRY_RUN" != "1" ]] || {
  echo "DRY_RUN mode enabled. Set DRY_RUN=0 to actually delete."
  exit 0
}
# ---- Step 3: perform the deletion ----
# (BATCH_DELETE already defaulted above; the redundant re-default that
# used to live here has been dropped.)
echo "[3/3] Deleting files..."
if [[ "$BATCH_DELETE" = "1" ]]; then
  echo "Mode: BATCH DELETE (using delete_files API)"
  echo ""
  $PYTHON << PYEOF_BATCH
import json, sys
from huggingface_hub import HfApi

token = """${HF_TOKEN}"""
repo_id = """${DATASET_REPO}"""
repo_type = """${REPO_TYPE}"""
delete_date = """${DELETE_DATE}"""

with open("${MATCHED_FILES_FILE//\\///}", 'r') as fh:
    files_to_delete = json.load(fh)

api = HfApi(token=token)

# Delete in batches of at most 100 files so a single commit never grows
# too large. Each batch is one commit and succeeds or fails atomically.
BATCH_SIZE = 100
total = len(files_to_delete)
total_batches = (total + BATCH_SIZE - 1) // BATCH_SIZE
deleted_total = 0
for batch_idx in range(total_batches):
    start = batch_idx * BATCH_SIZE
    batch = files_to_delete[start:start + BATCH_SIZE]
    try:
        # HfApi.delete_files takes `delete_patterns` (literal paths are
        # valid patterns); the previous `paths=` keyword does not exist
        # and raised TypeError on every batch.
        api.delete_files(
            repo_id=repo_id,
            repo_type=repo_type,
            delete_patterns=batch,
            commit_message=f"backup cleanup: batch {batch_idx + 1}/{total_batches} ({len(batch)} files, date {delete_date})"
        )
        deleted_total += len(batch)
        print(f" Batch {batch_idx + 1}/{total_batches}: deleted {len(batch)} files")
    except Exception as e:
        print(f" Batch {batch_idx + 1}/{total_batches}: FAILED - {e}")
        print(f"\nResult: deleted {deleted_total}/{total} files before failure")
        sys.exit(1)

print(f"\nResult: deleted all {total} files in {total_batches} batch(es)")
PYEOF_BATCH
else
  echo "Mode: INDIVIDUAL DELETE (one by one with retry)"
  echo ""
  $PYTHON << PYEOF_INDIVIDUAL
import json, sys, time
from huggingface_hub import HfApi

token = """${HF_TOKEN}"""
repo_id = """${DATASET_REPO}"""
repo_type = """${REPO_TYPE}"""
delete_date = """${DELETE_DATE}"""

with open("${MATCHED_FILES_FILE//\\///}", 'r') as fh:
    files_to_delete = json.load(fh)

api = HfApi(token=token)
total = len(files_to_delete)
deleted = 0
failed = 0
rate_limit_hits = 0
max_retries = 3

# One commit per file: slower than batch mode, but a single bad file
# cannot roll back the whole run, and 429s are retried with backoff.
for idx, path in enumerate(sorted(files_to_delete), 1):
    retries = 0
    while retries < max_retries:
        try:
            api.delete_file(
                repo_id=repo_id,
                repo_type=repo_type,
                path_in_repo=path,
                commit_message=f"backup cleanup: delete {path} (date {delete_date})"
            )
            deleted += 1
            print(f" [{idx}/{total}] Deleted: {path}")
            break
        except Exception as e:
            err = str(e)
            if "404" in err or "Entry Not Found" in err:
                # Already gone: count as deleted so cleanup is idempotent.
                print(f" [{idx}/{total}] Skipped (not found): {path}")
                deleted += 1
                break
            elif "429" in err or "Too Many Requests" in err:
                retries += 1
                rate_limit_hits += 1
                if retries >= max_retries:
                    print(f" [{idx}/{total}] Failed (rate limit): {path}")
                    failed += 1
                    break
                # Exponential backoff: 30s, 60s, ... capped at 30 min.
                wait = min(2 ** retries * 15, 1800)
                print(f" [{idx}/{total}] Rate limited, waiting {wait}s...")
                time.sleep(wait)
                continue
            else:
                print(f" [{idx}/{total}] Failed: {path} - {err}")
                failed += 1
                break

print(f"\nResult: deleted {deleted}, failed {failed}, rate limit hits {rate_limit_hits}")
# Previously the script exited 0 even when deletions failed; signal the
# failure so `set -e` in the shell stops before the final "Done!" banner.
if failed:
    sys.exit(1)
PYEOF_INDIVIDUAL
fi
# Final banner: reached only when every deletion step succeeded.
printf '\n'
printf '%s\n' "============================================"
printf '%s\n' "Done! Cleanup finished."
printf '%s\n' "============================================"