File size: 11,253 Bytes
3b47d98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
#!/usr/bin/env bash
# ============================================================
# OpenClaw HF Dataset 备份清理脚本
#
# 根据指定日期删除 Hugging Face Dataset 中所有匹配的备份文件。
# 如日期为 20260427,则删除文件名中包含 openclaw-backup-20260427、
# openclaw-backup-20260426、openclaw-backup-20260425 … 及更早日期的备份文件。
#
# 用法:
#   ./delete-backups.sh <DATASET_REPO> <DATE> [HF_TOKEN]
#
# 示例:
#   ./scripts/delete-backups.sh GGSheng/page-backup 20260427 hf_xxxxx
#
#   方式1:直接提供参数
#   ./scripts/delete-backups.sh GGSheng/page-backup 20260427 hf_xxxxx
#
#   方式2:设置环境变量
#   export DATASET_REPO="GGSheng/page-backup"
#   export DELETE_DATE="20260427"
#   export HF_TOKEN="hf_xxxxx"
#   ./scripts/delete-backups.sh
#
#   方式3:使用缓存的 token(默认从 ~/.cache/huggingface/token 读取)
#   ./scripts/delete-backups.sh GGSheng/page-backup 20260427
#
# 环境变量 (可选):
#   DATASET_REPO   - Dataset 仓库 ID (如 GGSheng/page-backup)
#   DELETE_DATE    - 要清理的日期 (YYYYMMDD 格式)
#   HF_TOKEN       - Hugging Face API Token
#   HF_TOKEN_FILE  - Token 文件路径 (默认 ~/.cache/huggingface/token)
#   DRY_RUN        - 设为 1 时仅列出匹配文件而不删除 (默认 0)
#   BATCH_DELETE   - 设为 1 时使用批量删除 API (默认 0,逐条删除)
#
# 删除模式:
#   BATCH_DELETE=0 (默认): 逐条删除,每条独立 commit,有重试/限速处理,容错性好
#   BATCH_DELETE=1: 批量删除,每批最多 100 个文件,单次 commit 含多个文件,效率更高
#
# 注意事项:
#   1. 请确保 HF_TOKEN 有对该 Dataset 的 write 权限
#   2. 删除操作不可逆,建议先用 DRY_RUN=1 预览匹配的文件
#   3. 日期格式固定为 YYYYMMDD(如 20260427)
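#   4. Batch mode uses the delete_files() API: each batch (up to 100 files) is a single commit, so a failure rolls back only that batch; batches that were already committed stay deleted.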
#
############################################################
# 与 push-to-space.sh 同样的参数模式
# ./scripts/delete-backups.sh GGSheng/page-backup 20260427 hf_xxxxx

# 使用缓存的 token
# ./scripts/delete-backups.sh GGSheng/page-backup 20260427

# 使用环境变量
# DATASET_REPO=GGSheng/page-backup DELETE_DATE=20260427 ./scripts/delete-backups.sh

# 仅预览
#  DRY_RUN=1 ./scripts/delete-backups.sh GGSheng/page-backup 20260430
# 删除
#  DRY_RUN=0 ./scripts/delete-backups.sh GGSheng/page-backup 20260430
# ============================================================

# Strict mode: abort on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Absolute path of the directory holding this script, and of its parent.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." && pwd)"

# ---- Pick the Python command ----
# Plain python3 is the default. When the parent directory has a pyproject.toml
# and the uv tool is installed, run Python through uv instead.
PYTHON="python3"
if [[ -f "${REPO_ROOT}/pyproject.toml" ]] && command -v uv >/dev/null 2>&1; then
  PYTHON="uv run python3"
  printf '%s\n' "[UV] Detected uv project, using: $PYTHON"
fi

# ---- Argument parsing ----
# Each setting is taken from its positional argument first and from the
# environment variable of the same name second. DATASET_REPO additionally
# falls back to OPENCLAW_BACKUP_DATASET_REPO.
DATASET_REPO="${1:-${DATASET_REPO:-${OPENCLAW_BACKUP_DATASET_REPO:-}}}"
DELETE_DATE="${2:-${DELETE_DATE:-}}"
HF_TOKEN="${3:-${HF_TOKEN:-}}"

# ---- Preconditions ----
# The repo id and the date are mandatory. The usage text is a diagnostic, so it
# is written to stderr and stdout stays clean for callers that capture it.
if [[ -z "$DATASET_REPO" || -z "$DELETE_DATE" ]]; then
  {
    echo "Usage: $0 <DATASET_REPO> <DATE> [HF_TOKEN]"
    echo ""
    echo "Examples:"
    echo "  $0 GGSheng/page-backup 20260427 hf_xxxxx"
    echo "  DATASET_REPO=GGSheng/page-backup DELETE_DATE=20260427 HF_TOKEN=hf_xxxxx $0"
    echo "  $0 GGSheng/page-backup 20260427"
    echo ""
    echo "Environment variables:"
    echo "  DATASET_REPO   - Dataset 仓库 ID (如 GGSheng/page-backup)"
    echo "  DELETE_DATE    - 要清理的日期 (YYYYMMDD)"
    echo "  HF_TOKEN       - Hugging Face API Token"
    echo "  HF_TOKEN_FILE  - Token 文件路径 (默认 ~/.cache/huggingface/token)"
    echo "  DRY_RUN        - 设为 1 仅预览 (默认 0)"
    echo "  BATCH_DELETE   - 设为 1 使用批量删除 (默认 0)"
  } >&2
  exit 1
fi

# The date must be exactly eight digits (YYYYMMDD).
if ! [[ "$DELETE_DATE" =~ ^[0-9]{8}$ ]]; then
  echo "Error: 日期格式错误 '$DELETE_DATE',应为 YYYYMMDD (如 20260427)" >&2
  exit 1
fi

#######################################
# Check that an eight-digit string names a real calendar day.
# Tries GNU `date -d` first. If that fails (for example on BSD/macOS, where
# -d does not parse a date), falls back to `date -j -f` and accepts the input
# only when re-formatting the parsed value gives back the same string.
# NOTE(review): the fallback assumes BSD date either rejects an impossible day
# or normalises it to a different date — confirm on the target platform.
# Arguments: $1 - date as YYYYMMDD (already known to be eight digits)
# Returns:   0 if the date is valid, non-zero otherwise
#######################################
is_valid_calendar_date() {
  local ymd=$1
  local normalized
  if date -d "${ymd:0:4}-${ymd:4:2}-${ymd:6:2}" >/dev/null 2>&1; then
    return 0
  fi
  normalized="$(date -j -f '%Y%m%d' "$ymd" '+%Y%m%d' 2>/dev/null)" || return 1
  [[ "$normalized" == "$ymd" ]]
}

# Reject digit strings that are not a real date (for example 20260230).
if ! is_valid_calendar_date "$DELETE_DATE"; then
  echo "Error: 无效的日期 '$DELETE_DATE'" >&2
  exit 1
fi

# ---- Resolve the HF token ----
# When no token came from the arguments or the environment, read it from
# HF_TOKEN_FILE (default: ~/.cache/huggingface/token) if that file exists.
if [[ -z "$HF_TOKEN" ]]; then
  HF_TOKEN_FILE="${HF_TOKEN_FILE:-$HOME/.cache/huggingface/token}"
  if [[ -f "$HF_TOKEN_FILE" ]]; then
    HF_TOKEN="$(<"$HF_TOKEN_FILE")"
    # Drop any whitespace, including a carriage return left by a CRLF file.
    # Command substitution only strips trailing newlines.
    HF_TOKEN="${HF_TOKEN//[[:space:]]/}"
  fi
fi

# Without a token nothing further can work. Report the path that was actually
# checked, which may differ from the default when HF_TOKEN_FILE is overridden.
if [[ -z "$HF_TOKEN" ]]; then
  echo "Error: HF_TOKEN is required. Provide as 3rd arg, set HF_TOKEN env var, or ensure $HF_TOKEN_FILE exists." >&2
  exit 1
fi

# Optional settings: keep a non-empty value from the environment, otherwise
# assign the default.
: "${DRY_RUN:=0}"
: "${REPO_TYPE:=dataset}"
: "${BATCH_DELETE:=0}"

# Human-readable labels for the two on/off switches shown in the banner.
if [[ "$DRY_RUN" == "1" ]]; then
  dry_run_label='YES (preview only)'
else
  dry_run_label='NO'
fi

if [[ "$BATCH_DELETE" == "1" ]]; then
  delete_mode_label='BATCH (delete_files API)'
else
  delete_mode_label='INDIVIDUAL (delete_file per file)'
fi

# Start-up banner summarising what this run will do.
banner_rule="============================================"
printf '%s\n' \
  "$banner_rule" \
  "OpenClaw HF Dataset Cleanup Script" \
  "$banner_rule" \
  "Dataset:     $DATASET_REPO" \
  "Delete From: ${DELETE_DATE:0:4}-${DELETE_DATE:4:2}-${DELETE_DATE:6:2}" \
  "DRY_RUN:     $dry_run_label" \
  "Delete Mode: $delete_mode_label" \
  ""

# ---- Temporary-file cleanup ----
# Paths appended to CLEANUP_FILES are removed when the script exits.
CLEANUP_FILES=()

#######################################
# Remove every path registered in CLEANUP_FILES.
# Globals:   CLEANUP_FILES (read)
# Arguments: none
# Returns:   status of the last rm, or 0 when nothing is registered
#######################################
cleanup() {
  local f
  # The ${arr[@]+...} form expands to nothing for an empty array. A plain
  # "${arr[@]}" on an empty array is an "unbound variable" error under
  # `set -u` in bash releases older than 4.4.
  for f in ${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}; do
    rm -f -- "$f"
  done
}
trap cleanup EXIT

# ---- Step 1: list and filter files ----
echo "[1/3] Fetching file list from Dataset..."

# Scratch files for the full listing and for the matched subset. Each one is
# registered for cleanup as soon as it exists, so the first is still removed
# if creating the second fails.
ALL_FILES_FILE="$(mktemp)"
CLEANUP_FILES+=("$ALL_FILES_FILE")
MATCHED_FILES_FILE="$(mktemp)"
CLEANUP_FILES+=("$MATCHED_FILES_FILE")

# Make sure the huggingface_hub module can be imported before going further.
# $PYTHON is left unquoted on purpose: it may hold a multi-word command.
if ! $PYTHON -c "import huggingface_hub" 2>/dev/null; then
  {
    echo "Error: 'huggingface_hub' module not found."
    echo ""
    echo "Install it with:"
    echo "  uv add huggingface_hub"
    echo ""
    echo "Or with CLI support:"
    echo "  uv add 'huggingface_hub[cli]'"
  } >&2
  exit 1
fi

# List every file in the repo and select the backups to delete.
# A path matches when the eight characters after its first "openclaw-backup-"
# are all digits and, read as an integer, are <= DELETE_DATE. The full listing
# and the matched subset are written as JSON to the two scratch files.
#
# The heredoc delimiter is quoted, so the shell expands nothing inside the
# Python source. All inputs travel through the environment instead, which
# means quotes or backslashes in a value cannot alter the program. Backslashes
# in the scratch paths are turned into forward slashes before being passed.
OC_HF_TOKEN="$HF_TOKEN" \
OC_REPO_ID="$DATASET_REPO" \
OC_REPO_TYPE="$REPO_TYPE" \
OC_MAX_DATE="$DELETE_DATE" \
OC_ALL_FILES_FILE="${ALL_FILES_FILE//\\///}" \
OC_MATCHED_FILES_FILE="${MATCHED_FILES_FILE//\\///}" \
$PYTHON << 'PYEOF'
import json, os, sys
from huggingface_hub import HfApi

token = os.environ["OC_HF_TOKEN"]
repo_id = os.environ["OC_REPO_ID"]
repo_type = os.environ["OC_REPO_TYPE"]
# int() on the string accepts leading zeros; a bare numeric literal would not.
max_date = int(os.environ["OC_MAX_DATE"])

api = HfApi(token=token)
try:
    files = api.list_repo_files(repo_id=repo_id, repo_type=repo_type)
except Exception as e:
    err = str(e)
    print(f"[ERROR] Failed to list files: {e}", file=sys.stderr)
    print(file=sys.stderr)
    if "404" in err:
        print(f"[HINT] Dataset '{repo_id}' not found.", file=sys.stderr)
        print(f"[HINT] Make sure you specified the Dataset repo, not the Space repo.", file=sys.stderr)
        print(f"[HINT] Backup dataset is typically named like '<space-name>-backup'.", file=sys.stderr)
        print(f"[HINT] Check your OPENCLAW_BACKUP_DATASET_REPO env var.", file=sys.stderr)
    sys.exit(1)

# Keep paths whose date stamp is on or before the cutoff.
matched = []
for f in files:
    parts = f.split("openclaw-backup-")
    if len(parts) >= 2:
        date_part = parts[1][:8]
        if date_part.isdigit() and int(date_part) <= max_date:
            matched.append(f)

print(f"Total files: {len(files)}, matched: {len(matched)}")

with open(os.environ["OC_ALL_FILES_FILE"], 'w') as fh:
    json.dump(files, fh)
with open(os.environ["OC_MATCHED_FILES_FILE"], 'w') as fh:
    json.dump(matched, fh)
PYEOF

# Path of the matched-files JSON, with backslashes turned into forward slashes.
matched_path="${MATCHED_FILES_FILE//\\///}"

# Count the matched files. The path is handed to Python as an argument rather
# than spliced into the source text, so a quote in the path cannot break the
# program, and the file is closed by the with-block.
MATCHED_COUNT="$($PYTHON -c 'import json, sys
with open(sys.argv[1]) as fh:
    print(len(json.load(fh)))' "$matched_path")"

# Nothing matched: report it and finish successfully.
if [[ "$MATCHED_COUNT" -eq 0 ]]; then
  echo "No matching backup files found. Nothing to clean up."
  exit 0
fi

# Print the matched paths in sorted order, one per line.
echo ""
echo "Matched files:"
$PYTHON -c 'import json, sys
with open(sys.argv[1]) as fh:
    for name in sorted(json.load(fh)):
        print(f"  - {name}")' "$matched_path"

# ---- Step 2: confirmation / dry-run gate ----
printf '\n%s\n' "[2/3] Ready to delete $MATCHED_COUNT file(s)..."

# In preview mode, stop here with success before anything is deleted.
case "$DRY_RUN" in
  1)
    printf '%s\n' "DRY_RUN mode enabled. Set DRY_RUN=0 to actually delete."
    exit 0
    ;;
esac

# ---- Step 3: perform the deletion ----
echo "[3/3] Deleting files..."

# Both modes run an inline Python program. The heredoc delimiters are quoted,
# so the shell expands nothing inside the Python source; the token, repo id,
# date and file path are passed through the environment. A value containing
# quotes or backslashes therefore cannot alter the program.
if [[ "${BATCH_DELETE:-0}" = "1" ]]; then
    echo "Mode: BATCH DELETE (using delete_files API)"
    echo ""

    OC_HF_TOKEN="$HF_TOKEN" \
    OC_REPO_ID="$DATASET_REPO" \
    OC_REPO_TYPE="$REPO_TYPE" \
    OC_DELETE_DATE="$DELETE_DATE" \
    OC_MATCHED_FILES_FILE="${MATCHED_FILES_FILE//\\///}" \
    $PYTHON << 'PYEOF_BATCH'
import json, os, sys
from huggingface_hub import HfApi

token = os.environ["OC_HF_TOKEN"]
repo_id = os.environ["OC_REPO_ID"]
repo_type = os.environ["OC_REPO_TYPE"]
delete_date = os.environ["OC_DELETE_DATE"]

with open(os.environ["OC_MATCHED_FILES_FILE"], 'r') as fh:
    files_to_delete = json.load(fh)

api = HfApi(token=token)

# Delete in batches of at most 100 files so that no single commit gets too big.
BATCH_SIZE = 100
total = len(files_to_delete)
total_batches = (total + BATCH_SIZE - 1) // BATCH_SIZE
deleted_total = 0

for batch_idx in range(total_batches):
    start = batch_idx * BATCH_SIZE
    end = min(start + BATCH_SIZE, total)
    batch = files_to_delete[start:end]

    try:
        api.delete_files(
            repo_id=repo_id,
            repo_type=repo_type,
            paths=batch,
            commit_message=f"backup cleanup: batch {batch_idx + 1}/{total_batches} ({len(batch)} files, date {delete_date})"
        )
        deleted_total += len(batch)
        print(f"  Batch {batch_idx + 1}/{total_batches}: deleted {len(batch)} files")
    except Exception as e:
        # Stop at the first failing batch; batches committed earlier stay deleted.
        print(f"  Batch {batch_idx + 1}/{total_batches}: FAILED - {e}")
        print(f"\nResult: deleted {deleted_total}/{total} files before failure")
        sys.exit(1)

print(f"\nResult: deleted all {total} files in {total_batches} batch(es)")
PYEOF_BATCH

else
    echo "Mode: INDIVIDUAL DELETE (one by one with retry)"
    echo ""

    OC_HF_TOKEN="$HF_TOKEN" \
    OC_REPO_ID="$DATASET_REPO" \
    OC_REPO_TYPE="$REPO_TYPE" \
    OC_DELETE_DATE="$DELETE_DATE" \
    OC_MATCHED_FILES_FILE="${MATCHED_FILES_FILE//\\///}" \
    $PYTHON << 'PYEOF_INDIVIDUAL'
import json, os, sys, time
from huggingface_hub import HfApi

token = os.environ["OC_HF_TOKEN"]
repo_id = os.environ["OC_REPO_ID"]
repo_type = os.environ["OC_REPO_TYPE"]
delete_date = os.environ["OC_DELETE_DATE"]

with open(os.environ["OC_MATCHED_FILES_FILE"], 'r') as fh:
    files_to_delete = json.load(fh)

api = HfApi(token=token)

total = len(files_to_delete)
deleted = 0
failed = 0
rate_limit_hits = 0
max_retries = 3

for idx, path in enumerate(sorted(files_to_delete), 1):
    retries = 0
    while retries < max_retries:
        try:
            api.delete_file(
                repo_id=repo_id,
                repo_type=repo_type,
                path_in_repo=path,
                commit_message=f"backup cleanup: delete {path} (date {delete_date})"
            )
            deleted += 1
            print(f"  [{idx}/{total}] Deleted: {path}")
            break
        except Exception as e:
            err = str(e)
            if "404" in err or "Entry Not Found" in err:
                # The file is already gone, so it is counted as deleted.
                print(f"  [{idx}/{total}] Skipped (not found): {path}")
                deleted += 1
                break
            elif "429" in err or "Too Many Requests" in err:
                # Only rate-limit errors are retried, with a growing wait.
                retries += 1
                rate_limit_hits += 1
                if retries >= max_retries:
                    print(f"  [{idx}/{total}] Failed (rate limit): {path}")
                    failed += 1
                    break
                wait = min(2 ** retries * 15, 1800)
                print(f"  [{idx}/{total}] Rate limited, waiting {wait}s...")
                time.sleep(wait)
                continue
            else:
                print(f"  [{idx}/{total}] Failed: {path} - {err}")
                failed += 1
                break

print(f"\nResult: deleted {deleted}, failed {failed}, rate limit hits {rate_limit_hits}")

# Report failure to the shell so the script does not finish with "Done!" and
# exit status 0 when some files could not be deleted.
if failed:
    sys.exit(1)
PYEOF_INDIVIDUAL
fi

# Reached only when the chosen delete mode exited successfully.
echo ""
echo "============================================"
echo "Done! Cleanup finished."
echo "============================================"