File size: 16,615 Bytes
17e971c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
#!/usr/bin/env bash
#
# OpenClaw Backup Watchdog - 备份系统兜底保障脚本
# 功能:监控备份系统运行状态,在cron失效时提供多重保障
# 设计原则:
#   1. 只监控备份状态,不干扰正常备份流程
#   2. 检测到备份缺失时,仅触发备份脚本,不直接操作备份逻辑
#   3. 避免与恢复进程冲突
#   4. 防止并发执行(使用锁机制)
#

# Strict mode: stop on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# --- Paths ---------------------------------------------------------------
# Directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Log directory; OPENCLAW_BACKUP_LOG_DIR overrides the default.
BACKUP_LOG_DIR="${OPENCLAW_BACKUP_LOG_DIR:-/var/log/openclaw}"
BACKUP_LOG_FILE="${BACKUP_LOG_DIR}/backup.log"
WATCHDOG_LOG_FILE="${BACKUP_LOG_DIR}/watchdog.log"
PID_FILE="/var/run/openclaw-backup-watchdog.pid"
LOCK_FILE="/var/run/openclaw-backup-watchdog.lock"

# --- Tunables (each keeps a value already present in the environment) ------
: "${WATCHDOG_INTERVAL:=300}"        # seconds between checks (5 minutes)
: "${MAX_BACKUP_AGE_MINUTES:=20}"    # maximum tolerated backup age, in minutes
: "${FORCE_BACKUP_INTERVAL:=3600}"   # seconds between forced backups (1 hour)

# --- ANSI colour sequences, expanded by `echo -e` in the log helpers -------
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
NC="\033[0m"

# Logging helpers - each one makes sure the log directory exists first.

# Emit an INFO line in green on stdout and append it to the watchdog log.
# Arguments: message words; they are joined into a single line.
# If the tee pipeline fails or times out, the uncoloured line is echoed.
log_info() {
    local stamp entry
    mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || true
    # A failing `date` must not abort the caller; the line is then logged
    # without a timestamp.
    stamp="$(date '+%Y-%m-%d %H:%M:%S')" || true
    entry="[INFO] ${stamp} $*"
    if ! echo -e "${GREEN}${entry}${NC}" | timeout 1 tee -a "$WATCHDOG_LOG_FILE" 2>/dev/null; then
        echo "$entry"
    fi
}

# Emit a WARN line in yellow on stdout and append it to the watchdog log.
# Arguments: message words; they are joined into a single line.
# If the tee pipeline fails or times out, the uncoloured line is echoed.
log_warn() {
    local stamp entry
    mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || true
    # A failing `date` must not abort the caller.
    stamp="$(date '+%Y-%m-%d %H:%M:%S')" || true
    entry="[WARN] ${stamp} $*"
    if ! echo -e "${YELLOW}${entry}${NC}" | timeout 1 tee -a "$WATCHDOG_LOG_FILE" 2>/dev/null; then
        echo "$entry"
    fi
}

# Emit an ERROR line in red on stdout and append it to the watchdog log.
# Arguments: message words; they are joined into a single line.
# If the tee pipeline fails or times out, the uncoloured line is echoed.
log_error() {
    local stamp entry
    mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || true
    # A failing `date` must not abort the caller.
    stamp="$(date '+%Y-%m-%d %H:%M:%S')" || true
    entry="[ERROR] ${stamp} $*"
    if ! echo -e "${RED}${entry}${NC}" | timeout 1 tee -a "$WATCHDOG_LOG_FILE" 2>/dev/null; then
        echo "$entry"
    fi
}

# Tell whether a flag-like value means "enabled".
# Arguments: $1 - value to test (optional; a missing value counts as empty)
# Returns:   0 for 1/true/yes/on in any letter case, 1 for anything else
is_true() {
    local normalized
    normalized="$(tr '[:upper:]' '[:lower:]' <<< "${1:-}")"
    case "$normalized" in
        1 | true | yes | on) return 0 ;;
        *) return 1 ;;
    esac
}

# Print the epoch time (seconds) of the most recent successful backup recorded
# in $BACKUP_LOG_FILE, or "0" when none can be determined.
#
# A line counts as a success marker when it contains one of:
#   backup uploaded:
#   Full backup complete
#   Incremental backup complete
# e.g. "[2026-04-24T04:00:02] backup uploaded: xxx". The last such line wins,
# and the first YYYY-MM-DDTHH:MM:SS timestamp on it is used, whether or not it
# is the bracketed prefix at the start of the line.
#
# Globals: BACKUP_LOG_FILE (read)
# Outputs: epoch seconds, or "0", on stdout
# Returns: 0 always; safe to call directly under `set -euo pipefail`
get_last_backup_time() {
    if [[ ! -f "$BACKUP_LOG_FILE" ]]; then
        echo "0"
        return
    fi

    # grep exits 1 when nothing matches; with pipefail that would fail the
    # whole assignment and abort a caller running under errexit, so the status
    # is discarded and the empty result is handled below.
    local last_backup
    last_backup=$(grep -E "backup uploaded:|Full backup complete|Incremental backup complete" \
        "$BACKUP_LOG_FILE" 2>/dev/null | tail -1) || true

    if [[ -z "$last_backup" ]]; then
        echo "0"
        return
    fi

    # The leftmost match covers both the bracketed prefix form and a timestamp
    # that appears later in the line.
    local -r ts_re='[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}'
    local log_time=""
    if [[ "$last_backup" =~ $ts_re ]]; then
        log_time="${BASH_REMATCH[0]}"
    fi

    if [[ -n "$log_time" ]]; then
        # `date -d` rejects impossible dates; report "0" in that case.
        date -d "$log_time" +%s 2>/dev/null || echo "0"
    else
        echo "0"
    fi
}

# Report whether another backup or restore process is currently running.
#
# Scans `pgrep -af` output for backup.py, openclaw-backup-cron and
# openclaw-restore command lines, ignoring this shell and its parent.
# A backup.py process only counts when the text after "backup.py" contains a
# "backup" or "restore" word (delimited by whitespace, "/" or the string
# boundary) or a "--command=" option.
#
# Returns: 0 if such a process is found, 1 otherwise
check_backup_or_restore_running() {
    local my_pid="$$"
    local my_ppid="$PPID"
    # POSIX bracket expressions do not understand "\s", so whitespace has to
    # be spelled [[:space:]]. The regex lives in a variable so the shell does
    # not reinterpret any of it.
    local -r subcommand_re='(^|[[:space:]/])(backup|restore)($|[[:space:]/])'

    local processes
    processes=$(pgrep -af "backup\.py|openclaw-backup-cron|openclaw-restore" 2>/dev/null || true)

    local line pid args
    while IFS= read -r line; do
        if [[ -z "$line" ]]; then
            continue
        fi

        # The first field of `pgrep -a` output is the PID. Compare it exactly:
        # a substring test would also skip unrelated processes whose PID or
        # arguments merely contain our PID's digits.
        pid="${line%% *}"
        if [[ "$pid" == "$my_pid" || "$pid" == "$my_ppid" ]]; then
            continue
        fi

        if [[ "$line" == *backup.py* ]]; then
            # Everything after the last "backup.py" on the command line.
            args="${line##*backup.py}"
            if [[ "$args" =~ $subcommand_re || "$args" == *--command=* ]]; then
                return 0
            fi
        fi

        if [[ "$line" == *openclaw-backup-cron* && "$line" != *watchdog* ]]; then
            return 0
        fi

        if [[ "$line" == *openclaw-restore* ]]; then
            return 0
        fi
    done <<< "$processes"

    return 1
}

# Decide whether a restore is under way, using both the process table and the
# restore log (a restore may have started before anything was logged).
#
# Globals: BACKUP_LOG_DIR (read; restore.log is looked up inside it)
# Returns: 0 when a "backup.py restore" process exists, or when the last
#            start/finish marker in restore.log is "Starting restore" and it
#            is under 120 minutes old or carries no timestamp;
#          1 otherwise (including a start marker 120 minutes old or more,
#            which is logged as stuck)
check_restore_in_progress() {
    # A live restore process settles the question immediately.
    if pgrep -af "backup.py" 2>/dev/null | grep -v "grep" | grep -qE "backup\.py\s+restore"; then
        return 0
    fi

    # Otherwise fall back to the restore log.
    local restore_log="${BACKUP_LOG_DIR}/restore.log"
    [[ -f "$restore_log" ]] || return 1

    local marker
    marker=$(grep -E "Starting restore|Restore completed" "$restore_log" 2>/dev/null | tail -1)
    [[ "$marker" == *"Starting restore"* ]] || return 1

    # The restore started but never reported completion: look at its age.
    local started_at
    started_at=$(echo "$marker" | grep -oE '\[?[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\]?' | head -1 | tr -d '[]')
    # Without a timestamp there is nothing to time out against.
    [[ -n "$started_at" ]] || return 0

    local start_epoch now_epoch
    start_epoch=$(date -d "$started_at" +%s 2>/dev/null || echo "0")
    now_epoch=$(date +%s)
    local minutes=$(( (now_epoch - start_epoch) / 60 ))

    if (( minutes >= 120 )); then
        log_warn "Restore appears to be stuck (${minutes}m), treating as completed"
        return 1
    fi
    return 0  # started recently, still considered running
}

# Report whether a cron daemon is running.
# Returns: 0 if a process named exactly "cron" or "crond" exists, 1 otherwise
check_cron_running() {
    local daemon
    for daemon in cron crond; do
        if pgrep -x "$daemon" >/dev/null 2>&1; then
            return 0
        fi
    done
    return 1
}

# Take the watchdog lock to prevent concurrent runs, using flock on fd 200.
#
# Arguments: $1 - seconds to wait for a busy lock (default 5)
# Globals:   LOCK_FILE (read); fd 200 is left open on the lock file
# Returns:   0 when the lock is held, or when the lock file cannot be opened
#              (the caller then proceeds unlocked, after a warning);
#            1 when the lock is still busy after the wait
acquire_lock() {
    local timeout="${1:-5}"

    # The group limits 2>/dev/null to this one attempt. Written directly on
    # `exec`, the redirection would be permanent and silence the script's
    # stderr from then on.
    if ! { exec 200>"$LOCK_FILE"; } 2>/dev/null; then
        log_warn "Cannot create lock file, proceeding without lock"
        return 0
    fi

    # Fast path: the lock is free.
    if flock -n 200 2>/dev/null; then
        return 0
    fi

    log_info "Another watchdog instance is running, waiting for lock..."
    if flock -w "$timeout" 200 2>/dev/null; then
        return 0
    fi

    log_warn "Could not acquire lock within ${timeout}s, skipping this check"
    # Nothing is held, so do not leave the descriptor open.
    { exec 200>&-; } 2>/dev/null || true
    return 1
}

# Release the watchdog lock and close fd 200. Best effort: it never fails,
# even when no lock is held or the descriptor is already closed.
release_lock() {
    flock -u 200 2>/dev/null || true
    # The group keeps 2>/dev/null temporary. Written directly on `exec`, the
    # redirection would permanently send the script's stderr to /dev/null.
    { exec 200>&-; } 2>/dev/null || true
}

# Append every line of a text, prefixed with "[WATCHDOG] ", to a log file.
# Arguments: $1 - log file, $2 - text (may span several lines)
# Each line is also echoed to stdout by tee. Failures to write are ignored.
_watchdog_record_output() {
    local target="$1" text="$2" line
    while IFS= read -r line; do
        echo "[WATCHDOG] $line" | timeout 1 tee -a "$target" 2>/dev/null \
            || echo "[WATCHDOG] $line" >> "$target" 2>/dev/null \
            || true
    done <<< "$text"
}

# Run a backup through the standard cron script so that watchdog-triggered
# backups behave like scheduled ones. The watchdog lock guards the run.
#
# Globals:   OPENCLAW_BACKUP_CRON_SCRIPT (read, optional) - script to run;
#              defaults to /usr/local/bin/openclaw-backup-cron.sh
#            WATCHDOG_LOG_FILE, BACKUP_LOG_FILE (appended to)
# Returns:   0 when the backup succeeded or was skipped because a backup or
#              restore is already active;
#            1 when the lock could not be obtained;
#            otherwise the exit code of the backup script
force_backup() {
    local backup_script="${OPENCLAW_BACKUP_CRON_SCRIPT:-/usr/local/bin/openclaw-backup-cron.sh}"

    # Fast path: skip before waiting on the lock if something is running.
    if check_backup_or_restore_running; then
        log_info "Watchdog: Backup/restore process is running, skipping"
        return 0
    fi

    if check_restore_in_progress; then
        log_info "Watchdog: Restore operation is in progress, skipping"
        return 0
    fi

    if ! acquire_lock 30; then
        log_warn "Could not acquire lock for backup, skipping"
        return 1
    fi

    # Check again: the situation may have changed while waiting for the lock.
    if check_backup_or_restore_running; then
        log_info "Watchdog: Backup/restore process started while waiting for lock, skipping"
        release_lock
        return 0
    fi

    if check_restore_in_progress; then
        log_info "Watchdog: Restore operation started while waiting for lock, skipping"
        release_lock
        return 0
    fi

    log_warn "Watchdog: Backup is overdue, triggering backup via cron script..."

    local backup_start
    backup_start=$(date +%s)

    # Capture the script's combined output and status without tripping errexit.
    local cron_output exit_code=0
    cron_output=$("$backup_script" 2>&1) || exit_code=$?

    # Record the output in both the watchdog log and the backup log.
    if [[ -n "$cron_output" ]]; then
        _watchdog_record_output "$WATCHDOG_LOG_FILE" "$cron_output"
        _watchdog_record_output "$BACKUP_LOG_FILE" "$cron_output"
    fi

    local backup_end
    backup_end=$(date +%s)
    local duration=$((backup_end - backup_start))

    if [[ $exit_code -eq 0 ]]; then
        log_info "Watchdog: Backup completed successfully (${duration}s)"
    else
        log_error "Watchdog: Backup failed (exit code: $exit_code)"
    fi

    release_lock
    return $exit_code
}

# Check whether the most recent backup is fresh enough.
#
# Globals: MAX_BACKUP_AGE_MINUTES (read)
# Returns: 0 when the last backup is at most MAX_BACKUP_AGE_MINUTES old;
#          1 when no backup is known or the last one is older than that
check_backup_health() {
    local last_epoch now_epoch minutes_old
    last_epoch=$(get_last_backup_time)
    now_epoch=$(date +%s)

    # "0" is the "nothing found" answer of get_last_backup_time.
    if [[ "$last_epoch" == "0" ]]; then
        log_warn "Watchdog: No previous backup found in logs"
        return 1
    fi

    minutes_old=$(( (now_epoch - last_epoch) / 60 ))

    if (( minutes_old > MAX_BACKUP_AGE_MINUTES )); then
        log_warn "Watchdog: Last backup was ${minutes_old} minutes ago (threshold: ${MAX_BACKUP_AGE_MINUTES})"
        return 1
    fi

    log_info "Watchdog: Backup health OK (last: ${minutes_old}m ago)"
    return 0
}

# Main monitoring loop; runs until the process is terminated.
#
# Each round:
#   1. skip if a backup/restore process is running or a restore is in progress;
#   2. otherwise request a backup when the health check fails;
#   3. otherwise request one when FORCE_BACKUP_INTERVAL seconds have passed
#      since the last successful force_backup of this run;
#   4. run force_backup if requested; after 3 failures in a row, sleep twice
#      the normal interval and start counting again;
#   5. sleep WATCHDOG_INTERVAL seconds.
#
# Globals: WATCHDOG_INTERVAL, MAX_BACKUP_AGE_MINUTES, FORCE_BACKUP_INTERVAL (read)
watchdog_loop() {
    local rule="========================================"
    log_info "$rule"
    log_info "OpenClaw Backup Watchdog Started"
    log_info "$rule"
    log_info "Check interval: ${WATCHDOG_INTERVAL}s"
    log_info "Max backup age: ${MAX_BACKUP_AGE_MINUTES} minutes"
    log_info "Force backup interval: ${FORCE_BACKUP_INTERVAL}s"
    log_info "Note: Watchdog only triggers backup when needed, respects running processes"

    local forced_at=0
    local iteration=0
    local failure_streak=0
    local failure_limit=3
    local now idle_seconds trigger reason

    while :; do
        iteration=$((iteration + 1))
        now=$(date +%s)

        # Quiet by default: only log when there is something to report.
        trigger=false
        reason=""

        if check_backup_or_restore_running; then
            reason="backup/restore process running"
            failure_streak=0
        elif check_restore_in_progress; then
            reason="restore in progress"
            failure_streak=0
        elif ! check_backup_health; then
            trigger=true
        fi

        # Periodic forced backup, considered only when nothing else applied.
        if [[ "$trigger" == "false" && -z "$reason" ]]; then
            idle_seconds=$((now - forced_at))
            if (( idle_seconds >= FORCE_BACKUP_INTERVAL )); then
                log_info "Watchdog: Force backup interval reached (${idle_seconds}s)"
                trigger=true
            fi
        fi

        if [[ "$trigger" == "true" ]]; then
            if force_backup; then
                forced_at=$(date +%s)
                failure_streak=0
            else
                failure_streak=$((failure_streak + 1))
                if (( failure_streak >= failure_limit )); then
                    log_error "Watchdog: Backup failed ${failure_streak} consecutive times, backing off"
                    # Wait longer than usual so failures do not repeat rapidly.
                    sleep $((WATCHDOG_INTERVAL * 2))
                    failure_streak=0
                    continue
                fi
            fi
        elif [[ -n "$reason" && $((iteration % 10)) -eq 1 ]]; then
            # Report the skip reason on rounds 1, 11, 21, ... to limit log volume.
            log_info "Watchdog: Check #${iteration} - Skipped (${reason})"
        fi

        sleep "$WATCHDOG_INTERVAL"
    done
}

# Shutdown handler: log the shutdown, drop the lock, remove the PID file and
# exit with status 0.
# Globals: PID_FILE (read; the file is removed)
cleanup() {
    log_info "Watchdog shutting down..."
    if ! release_lock 2>/dev/null; then
        :   # releasing the lock is best effort
    fi
    rm -f "$PID_FILE"
    exit 0
}

# Signal handling: run cleanup when the script receives SIGTERM or SIGINT.
trap cleanup SIGTERM SIGINT

# Foreground entry point: refuse to start twice, honour the enable switch,
# record the PID and enter the monitoring loop.
#
# Globals: PID_FILE (read/written), BACKUP_LOG_DIR (created if missing),
#          OPENCLAW_BACKUP_ENABLED (read; anything is_true rejects disables
#          the watchdog)
# Exits:   1 when another live watchdog owns the PID file;
#          0 when backups are disabled
main() {
    # Is another instance already running?
    if [[ -f "$PID_FILE" ]]; then
        local old_pid
        old_pid=$(cat "$PID_FILE" 2>/dev/null || echo "0")
        # Only a positive integer is a usable PID; anything else (empty,
        # "0", "-1", garbage) is treated as stale rather than given to kill.
        # Our own PID is not "another instance": the launcher may have
        # written it before we got here.
        if [[ "$old_pid" =~ ^[1-9][0-9]*$ && "$old_pid" != "$$" ]] \
            && kill -0 "$old_pid" 2>/dev/null; then
            log_error "Watchdog is already running (PID: $old_pid)"
            exit 1
        else
            rm -f "$PID_FILE"
        fi
    fi

    # Create the log directory if it does not exist yet.
    mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || {
        echo "Warning: Cannot create log directory $BACKUP_LOG_DIR" >&2
    }

    # Nothing to watch when backups are switched off.
    if ! is_true "${OPENCLAW_BACKUP_ENABLED:-false}"; then
        log_info "Watchdog: Backup is disabled (OPENCLAW_BACKUP_ENABLED=false), exiting"
        exit 0
    fi

    # Record our PID. The group makes 2>/dev/null also cover a failure to
    # open the PID file itself.
    { echo "$$" > "$PID_FILE"; } 2>/dev/null || {
        log_warn "Cannot write PID file $PID_FILE (continuing anyway)"
    }

    watchdog_loop
}

# Start the watchdog in the background by re-running this script without
# arguments under nohup.
#
# The PID file is deliberately NOT written here: the child's start-up code
# checks and writes it. Writing the child's PID from the parent races with
# that check and, when a watchdog is already running, overwrites the PID of
# the live instance with that of a child that exits at once.
#
# The child's stdout is discarded because the log helpers already append each
# message to the watchdog log through tee; sending stdout to the same file
# would record every line twice. Its stderr still goes to the log.
#
# Globals: WATCHDOG_LOG_FILE (appended to by the child's stderr)
daemon_mode() {
    log_info "Starting watchdog in daemon mode..."
    nohup "$0" > /dev/null 2>> "$WATCHDOG_LOG_FILE" &
    local pid=$!
    log_info "Watchdog daemon started (PID: $pid)"
}

# Command-line dispatch. With no recognised option the watchdog runs in the
# foreground.
case "${1:-}" in
    -d|--daemon)
        daemon_mode
        ;;
    -s|--stop)
        if [[ -f "$PID_FILE" ]]; then
            pid=$(cat "$PID_FILE")
            if kill "$pid" 2>/dev/null; then
                echo "Watchdog stopped (PID: $pid)"
            else
                echo "Watchdog not running or already stopped"
            fi
            rm -f "$PID_FILE"
            rm -f "$LOCK_FILE"
        else
            echo "Watchdog is not running (no PID file)"
        fi
        ;;
    -c|--check)
        # One-shot status report. Checks that may log have their stdout
        # discarded so log lines do not end up inside the status values.
        echo "=== Watchdog Status Check ==="
        echo "Cron running: $(check_cron_running && echo "YES" || echo "NO")"
        echo "Backup/restore running: $(check_backup_or_restore_running && echo "YES" || echo "NO")"
        echo "Restore in progress: $(check_restore_in_progress >/dev/null && echo "YES" || echo "NO")"
        echo "Backup health: $(check_backup_health >/dev/null && echo "OK" || echo "NEEDS ATTENTION")"
        # get_last_backup_time prints "0" when no backup is known; report that
        # as Unknown instead of formatting it as 1970-01-01.
        last_backup_epoch=$(get_last_backup_time)
        if [[ "$last_backup_epoch" =~ ^[1-9][0-9]*$ ]]; then
            echo "Last backup: $(date -d "@${last_backup_epoch}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "Unknown")"
        else
            echo "Last backup: Unknown"
        fi
        ;;
    -h|--help)
        cat <<EOF
OpenClaw Backup Watchdog - 备份系统兜底保障

用法: $0 [选项]

选项:
    -d, --daemon    后台守护模式运行
    -s, --stop      停止守护进程
    -c, --check     单次检查模式
    -h, --help      显示帮助

环境变量:
    WATCHDOG_INTERVAL         检查间隔(秒,默认300)
    MAX_BACKUP_AGE_MINUTES    最大备份间隔(分钟,默认20)
    FORCE_BACKUP_INTERVAL     强制备份间隔(秒,默认3600)

说明:
    本脚本作为备份系统的兜底保障,仅在检测到备份缺失且
    没有其他备份/恢复进程运行时,才会触发备份。
    不会干扰正常的备份和恢复流程。

    关键特性:
    - 使用文件锁防止并发执行
    - 检测 backup.py backup 和 backup.py restore
    - 恢复操作超时保护(2小时)
    - 连续失败退避机制

日志位置: $WATCHDOG_LOG_FILE
EOF
        ;;
    *)
        main
        ;;
esac