#!/usr/bin/env bash # # OpenClaw Backup Watchdog - 备份系统兜底保障脚本 # 功能:监控备份系统运行状态,在cron失效时提供多重保障 # 设计原则: # 1. 只监控备份状态,不干扰正常备份流程 # 2. 检测到备份缺失时,仅触发备份脚本,不直接操作备份逻辑 # 3. 避免与恢复进程冲突 # 4. 防止并发执行(使用锁机制) # set -euo pipefail # 配置 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" BACKUP_LOG_DIR="${OPENCLAW_BACKUP_LOG_DIR:-/var/log/openclaw}" BACKUP_LOG_FILE="${BACKUP_LOG_DIR}/backup.log" WATCHDOG_LOG_FILE="${BACKUP_LOG_DIR}/watchdog.log" PID_FILE="/var/run/openclaw-backup-watchdog.pid" LOCK_FILE="/var/run/openclaw-backup-watchdog.lock" # 默认配置(可通过环境变量覆盖) WATCHDOG_INTERVAL="${WATCHDOG_INTERVAL:-300}" # 检查间隔:5分钟 MAX_BACKUP_AGE_MINUTES="${MAX_BACKUP_AGE_MINUTES:-20}" # 最大备份间隔:20分钟 FORCE_BACKUP_INTERVAL="${FORCE_BACKUP_INTERVAL:-3600}" # 强制备份间隔:1小时 # 颜色输出 RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' NC='\033[0m' # 日志函数 - 先确保日志文件存在 log_info() { mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || true local msg="[INFO] $(date '+%Y-%m-%d %H:%M:%S') $*" echo -e "${GREEN}${msg}${NC}" | timeout 1 tee -a "$WATCHDOG_LOG_FILE" 2>/dev/null || echo "$msg" } log_warn() { mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || true local msg="[WARN] $(date '+%Y-%m-%d %H:%M:%S') $*" echo -e "${YELLOW}${msg}${NC}" | timeout 1 tee -a "$WATCHDOG_LOG_FILE" 2>/dev/null || echo "$msg" } log_error() { mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || true local msg="[ERROR] $(date '+%Y-%m-%d %H:%M:%S') $*" echo -e "${RED}${msg}${NC}" | timeout 1 tee -a "$WATCHDOG_LOG_FILE" 2>/dev/null || echo "$msg" } is_true() { local value value="$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')" [[ "$value" == "1" || "$value" == "true" || "$value" == "yes" || "$value" == "on" ]] } # 获取最后一次备份时间(从日志解析) get_last_backup_time() { if [[ ! -f "$BACKUP_LOG_FILE" ]]; then echo "0" return fi # 从日志中提取最后一次成功备份的时间戳 # 匹配 backup.py 输出的成功日志格式: # [2026-04-24T04:00:02] backup uploaded: xxx # [2026-04-24T04:00:02] Full backup complete: xxx # [2026-04-24T04:00:02] Incremental backup complete: xxx # 注意:cron脚本会在每行开头添加 [timestamp],backup.py的输出在 >> 追加后也会带有cron的timestamp local last_backup last_backup=$(grep -E "backup uploaded:|Full backup complete|Incremental backup complete" "$BACKUP_LOG_FILE" 2>/dev/null | tail -1) if [[ -z "$last_backup" ]]; then echo "0" return fi # 解析日志时间戳 [2026-01-15T10:30:00] 格式(cron脚本添加的前缀) local log_time log_time=$(echo "$last_backup" | grep -oE '^\[[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\]' | head -1 | tr -d '[]') # 如果没有找到cron格式的时间戳,尝试从backup.py直接输出中解析 if [[ -z "$log_time" ]]; then log_time=$(echo "$last_backup" | grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}' | head -1) fi if [[ -n "$log_time" ]]; then date -d "$log_time" +%s 2>/dev/null || echo "0" else echo "0" fi } check_backup_or_restore_running() { local my_pid="$$" local my_ppid="$PPID" local processes processes=$(pgrep -af "backup\.py|openclaw-backup-cron|openclaw-restore" 2>/dev/null || true) while IFS= read -r line; do [[ -z "$line" ]] && continue [[ "$line" == *"$my_pid"* ]] && continue [[ "$line" == *"$my_ppid"* ]] && continue local pid pid=$(echo "$line" | awk '{print $1}') [[ "$pid" == "$my_pid" ]] || [[ "$pid" == "$my_ppid" ]] && continue if [[ "$line" =~ backup\.py ]]; then local args args=$(echo "$line" | sed 's/.*backup\.py//' | tr -s ' ') if [[ "$args" =~ (^|[\s/])(backup|restore)($|[\s/]) ]] || [[ "$args" =~ \-\-command= ]]; then return 0 fi fi if [[ "$line" =~ openclaw-backup-cron ]] && [[ "$line" != *watchdog* ]]; then return 0 fi if [[ "$line" =~ openclaw-restore ]]; then return 0 fi done <<< "$processes" return 1 } # 检查是否有恢复操作正在进行(从日志判断) # 临界情况:恢复可能正在进行但还没写入日志 # 解决方案:同时检查进程和日志 check_restore_in_progress() { # 首先检查恢复进程 if pgrep -af "backup.py" 2>/dev/null | grep -v "grep" | grep -qE "backup\.py\s+restore"; then return 0 fi # 然后检查恢复日志 local restore_log="${BACKUP_LOG_DIR}/restore.log" if [[ -f "$restore_log" ]]; then local last_restore last_restore=$(grep -E "Starting restore|Restore completed" "$restore_log" 2>/dev/null | tail -1) if [[ "$last_restore" == *"Starting restore"* ]]; then # 检查日志时间,如果超过2小时还没有完成,认为恢复已失败 local log_time log_time=$(echo "$last_restore" | grep -oE '\[?[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\]?' | head -1 | tr -d '[]') if [[ -n "$log_time" ]]; then local restore_start restore_start=$(date -d "$log_time" +%s 2>/dev/null || echo "0") local current_time current_time=$(date +%s) local elapsed_minutes=$(( (current_time - restore_start) / 60 )) if [[ $elapsed_minutes -lt 120 ]]; then return 0 # 恢复进行中且未超时 else log_warn "Restore appears to be stuck (${elapsed_minutes}m), treating as completed" return 1 fi fi return 0 fi fi return 1 } # 检查cron是否运行 check_cron_running() { if pgrep -x "cron" >/dev/null 2>&1; then return 0 fi # 也检查crond if pgrep -x "crond" >/dev/null 2>&1; then return 0 fi return 1 } # 获取锁(防止并发执行) # 使用 flock 实现进程间互斥 acquire_lock() { local timeout="${1:-5}" exec 200>"$LOCK_FILE" 2>/dev/null || { log_warn "Cannot create lock file, proceeding without lock" return 0 } if ! flock -n 200 2>/dev/null; then log_info "Another watchdog instance is running, waiting for lock..." if ! flock -w "$timeout" 200 2>/dev/null; then log_warn "Could not acquire lock within ${timeout}s, skipping this check" return 1 fi fi return 0 } # 释放锁 release_lock() { flock -u 200 2>/dev/null || true exec 200>&- 2>/dev/null || true } # 执行备份(通过调用标准备份脚本,保持行为一致) # 关键:使用锁防止并发执行 force_backup() { # 第一步检查:快速路径 - 如果有备份/恢复进程在运行,直接跳过 # 这样可以避免不必要的锁等待 if check_backup_or_restore_running; then log_info "Watchdog: Backup/restore process is running, skipping" return 0 fi if check_restore_in_progress; then log_info "Watchdog: Restore operation is in progress, skipping" return 0 fi # 第二步:尝试获取锁 if ! acquire_lock 30; then log_warn "Could not acquire lock for backup, skipping" return 1 fi # 获取锁后再次检查 - 因为在等待锁期间状态可能发生变化 if check_backup_or_restore_running; then log_info "Watchdog: Backup/restore process started while waiting for lock, skipping" release_lock return 0 fi if check_restore_in_progress; then log_info "Watchdog: Restore operation started while waiting for lock, skipping" release_lock return 0 fi log_warn "Watchdog: Backup is overdue, triggering backup via cron script..." # 执行备份(调用标准备份脚本,保持行为一致) local backup_start backup_start=$(date +%s) # Watchdog 触发备份时,同时记录到 watchdog.log 和 backup.log # 避免与正在写入的 cron 输出冲突,使用 tee 追加 local cron_output exit_code=0 cron_output=$(/usr/local/bin/openclaw-backup-cron.sh 2>&1) || exit_code=$? if [[ -n "$cron_output" ]]; then echo "$cron_output" | while IFS= read -r line; do echo "[WATCHDOG] $line" | timeout 1 tee -a "$WATCHDOG_LOG_FILE" 2>/dev/null || echo "[WATCHDOG] $line" >> "$WATCHDOG_LOG_FILE" 2>/dev/null || true done echo "$cron_output" | while IFS= read -r line; do echo "[WATCHDOG] $line" | timeout 1 tee -a "$BACKUP_LOG_FILE" 2>/dev/null || echo "[WATCHDOG] $line" >> "$BACKUP_LOG_FILE" 2>/dev/null || true done fi local backup_end backup_end=$(date +%s) local duration=$((backup_end - backup_start)) if [[ $exit_code -eq 0 ]]; then log_info "Watchdog: Backup completed successfully (${duration}s)" else log_error "Watchdog: Backup failed (exit code: $exit_code)" fi release_lock return $exit_code } # 检查备份健康状态 check_backup_health() { local last_backup_time last_backup_time=$(get_last_backup_time) local current_time current_time=$(date +%s) if [[ "$last_backup_time" == "0" ]]; then log_warn "Watchdog: No previous backup found in logs" return 1 fi local age_minutes=$(( (current_time - last_backup_time) / 60 )) if [[ $age_minutes -gt $MAX_BACKUP_AGE_MINUTES ]]; then log_warn "Watchdog: Last backup was ${age_minutes} minutes ago (threshold: ${MAX_BACKUP_AGE_MINUTES})" return 1 fi log_info "Watchdog: Backup health OK (last: ${age_minutes}m ago)" return 0 } # 主监控循环 watchdog_loop() { log_info "========================================" log_info "OpenClaw Backup Watchdog Started" log_info "========================================" log_info "Check interval: ${WATCHDOG_INTERVAL}s" log_info "Max backup age: ${MAX_BACKUP_AGE_MINUTES} minutes" log_info "Force backup interval: ${FORCE_BACKUP_INTERVAL}s" log_info "Note: Watchdog only triggers backup when needed, respects running processes" local last_forced_backup=0 local check_count=0 local consecutive_failures=0 local max_consecutive_failures=3 while true; do check_count=$((check_count + 1)) local current_time current_time=$(date +%s) # 静默模式:只有需要记录时才输出 local need_backup=false local skip_reason="" # 1. 检查是否有备份/恢复进程在运行 if check_backup_or_restore_running; then skip_reason="backup/restore process running" consecutive_failures=0 # 重置失败计数 # 2. 检查是否有恢复正在进行 elif check_restore_in_progress; then skip_reason="restore in progress" consecutive_failures=0 # 重置失败计数 # 3. 检查备份健康状态 elif ! check_backup_health; then need_backup=true fi # 4. 检查是否超过强制备份间隔(仅在健康检查通过时) if [[ "$need_backup" == "false" && -z "$skip_reason" ]]; then local time_since_forced=$((current_time - last_forced_backup)) if [[ $time_since_forced -ge $FORCE_BACKUP_INTERVAL ]]; then log_info "Watchdog: Force backup interval reached (${time_since_forced}s)" need_backup=true fi fi # 5. 执行备份(如果需要且可以执行) if [[ "$need_backup" == "true" ]]; then if force_backup; then last_forced_backup=$(date +%s) consecutive_failures=0 else consecutive_failures=$((consecutive_failures + 1)) if [[ $consecutive_failures -ge $max_consecutive_failures ]]; then log_error "Watchdog: Backup failed ${consecutive_failures} consecutive times, backing off" # 增加检查间隔,避免频繁失败 sleep $((WATCHDOG_INTERVAL * 2)) consecutive_failures=0 continue fi fi elif [[ -n "$skip_reason" ]]; then # 每10次检查输出一次跳过原因(避免日志过多) if (( check_count % 10 == 1 )); then log_info "Watchdog: Check #${check_count} - Skipped (${skip_reason})" fi fi # 等待下一次检查 sleep "$WATCHDOG_INTERVAL" done } # 清理函数 cleanup() { log_info "Watchdog shutting down..." release_lock 2>/dev/null || true rm -f "$PID_FILE" exit 0 } # 信号处理 trap cleanup SIGTERM SIGINT # 主函数 main() { # 检查是否已经在运行 if [[ -f "$PID_FILE" ]]; then local old_pid old_pid=$(cat "$PID_FILE" 2>/dev/null || echo "0") if [[ "$old_pid" != "0" ]] && kill -0 "$old_pid" 2>/dev/null; then log_error "Watchdog is already running (PID: $old_pid)" exit 1 else rm -f "$PID_FILE" fi fi # 创建日志目录(如果不存在) mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || { echo "Warning: Cannot create log directory $BACKUP_LOG_DIR" >&2 } # Check if backup is enabled if ! is_true "${OPENCLAW_BACKUP_ENABLED:-false}"; then log_info "Watchdog: Backup is disabled (OPENCLAW_BACKUP_ENABLED=false), exiting" exit 0 fi # 写入PID文件 echo $$ > "$PID_FILE" 2>/dev/null || { log_warn "Cannot write PID file $PID_FILE (continuing anyway)" } # 启动监控循环 watchdog_loop } # 后台运行模式 daemon_mode() { log_info "Starting watchdog in daemon mode..." nohup "$0" >> "$WATCHDOG_LOG_FILE" 2>&1 & local pid=$! echo $pid > "$PID_FILE" log_info "Watchdog daemon started (PID: $pid)" } # 命令行处理 case "${1:-}" in -d|--daemon) daemon_mode ;; -s|--stop) if [[ -f "$PID_FILE" ]]; then pid=$(cat "$PID_FILE") if kill "$pid" 2>/dev/null; then echo "Watchdog stopped (PID: $pid)" else echo "Watchdog not running or already stopped" fi rm -f "$PID_FILE" rm -f "$LOCK_FILE" else echo "Watchdog is not running (no PID file)" fi ;; -c|--check) # 单次检查模式 echo "=== Watchdog Status Check ===" echo "Cron running: $(check_cron_running && echo "YES" || echo "NO")" echo "Backup/restore running: $(check_backup_or_restore_running && echo "YES" || echo "NO")" echo "Restore in progress: $(check_restore_in_progress && echo "YES" || echo "NO")" echo "Backup health: $(check_backup_health && echo "OK" || echo "NEEDS ATTENTION")" echo "Last backup: $(get_last_backup_time | xargs -I {} date -d @{} '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "Unknown")" ;; -h|--help) cat <