page / scripts /openclaw-backup-watchdog.sh
GGSheng's picture
feat: deploy Gemma 4 to hf space
a757bd3 verified
#!/usr/bin/env bash
#
# OpenClaw Backup Watchdog - 备份系统兜底保障脚本
# 功能:监控备份系统运行状态,在cron失效时提供多重保障
# 设计原则:
# 1. 只监控备份状态,不干扰正常备份流程
# 2. 检测到备份缺失时,仅触发备份脚本,不直接操作备份逻辑
# 3. 避免与恢复进程冲突
# 4. 防止并发执行(使用锁机制)
#
set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Log locations; the directory can be overridden with OPENCLAW_BACKUP_LOG_DIR.
BACKUP_LOG_DIR="${OPENCLAW_BACKUP_LOG_DIR:-/var/log/openclaw}"
BACKUP_LOG_FILE="${BACKUP_LOG_DIR}/backup.log"
WATCHDOG_LOG_FILE="${BACKUP_LOG_DIR}/watchdog.log"

# Runtime state files.
PID_FILE="/var/run/openclaw-backup-watchdog.pid"
LOCK_FILE="/var/run/openclaw-backup-watchdog.lock"

# Tunables: an existing non-empty environment value wins, otherwise the
# default below is assigned.
: "${WATCHDOG_INTERVAL:=600}"        # seconds between checks (10 minutes)
: "${MAX_BACKUP_AGE_MINUTES:=30}"    # maximum tolerated backup age (minutes)
: "${FORCE_BACKUP_INTERVAL:=14400}"  # forced backup interval in seconds (4 hours)

# ANSI colour sequences (interpreted later by `echo -e`).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
# Logging helpers.
#
# _watchdog_log is the shared implementation behind log_info / log_warn /
# log_error.
# Globals:   BACKUP_LOG_DIR, WATCHDOG_LOG_FILE, NC (read)
# Arguments: $1 - level tag (INFO/WARN/ERROR)
#            $2 - colour escape sequence for the line
#            $3.. - message words
# Outputs:   one line on stdout; the coloured line is also appended to
#            WATCHDOG_LOG_FILE when that file is writable
_watchdog_log() {
  local level="$1" color="$2"
  shift 2
  # Best effort: make sure the log directory exists before appending.
  mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || true
  local msg
  msg="[${level}] $(date '+%Y-%m-%d %H:%M:%S') $*"
  # `timeout 1` keeps a stalled log write from blocking the watchdog.
  # tee's own stdout copy is discarded so that a failed write cannot leave
  # the message printed twice (once by tee, once by the fallback).
  if echo -e "${color}${msg}${NC}" \
    | timeout 1 tee -a "$WATCHDOG_LOG_FILE" >/dev/null 2>&1; then
    echo -e "${color}${msg}${NC}"
  else
    # Log file not writable: emit the plain message exactly once.
    echo "$msg"
  fi
}
log_info() {
  _watchdog_log "INFO" "$GREEN" "$@"
}
log_warn() {
  _watchdog_log "WARN" "$YELLOW" "$@"
}
log_error() {
  _watchdog_log "ERROR" "$RED" "$@"
}
# Return 0 when $1 is a truthy flag (1/true/yes/on, case-insensitive),
# 1 otherwise (including when $1 is missing or empty).
is_true() {
  local normalized
  normalized="$(printf '%s' "${1:-}" | tr '[:upper:]' '[:lower:]')"
  case "$normalized" in
    1 | true | yes | on)
      return 0
      ;;
    *)
      return 1
      ;;
  esac
}
# Print the epoch time (seconds) of the most recent successful backup found
# in BACKUP_LOG_FILE, or "0" when the file is missing, holds no completion
# line, or the timestamp cannot be parsed.
#
# Completion lines are those containing "backup complete" or
# "backup uploaded" (case-insensitive), for example:
#   [2026-04-24T04:00:02] Full backup complete: X files, Y.MB
#   [2026-04-24T04:00:02] Incremental backup complete: X files, Y.MB
#   [WATCHDOG] [2026-04-24T04:00:02] Full backup complete: X files, Y.MB
#   [WATCHDOG] [2026-04-24T04:00:02] Incremental backup complete: X files, Y.MB
# Globals: BACKUP_LOG_FILE (read)
# Outputs: epoch seconds (or 0) on stdout
get_last_backup_time() {
  local iso_re='[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}'
  local bracketed_re="\\[(${iso_re})\\]"
  local newest_line stamp=""

  [[ -f "$BACKUP_LOG_FILE" ]] || {
    echo "0"
    return
  }

  # Only the last completion line matters.
  newest_line=$(grep -iE "backup complete|backup uploaded" "$BACKUP_LOG_FILE" 2>/dev/null | tail -1)
  if [[ -z "$newest_line" ]]; then
    echo "0"
    return
  fi

  # Prefer the first bracketed timestamp (this also covers the
  # "[WATCHDOG] [timestamp]" prefix); otherwise accept a bare ISO timestamp.
  if [[ "$newest_line" =~ $bracketed_re ]]; then
    stamp="${BASH_REMATCH[1]}"
  elif [[ "$newest_line" =~ $iso_re ]]; then
    stamp="${BASH_REMATCH[0]}"
  fi

  if [[ -z "$stamp" ]]; then
    echo "0"
    return
  fi
  date -d "$stamp" +%s 2>/dev/null || echo "0"
}
# Return 0 when another backup or restore process is currently running,
# 1 otherwise.
#
# Candidates come from `pgrep -af` (one "PID command line" entry per line).
# This shell and its parent are ignored by exact PID comparison. A candidate
# counts as running when it is:
#   - backup.py invoked with a backup/restore word or a --command= option
#   - an openclaw-backup-cron process (unless the line mentions "watchdog")
#   - an openclaw-restore process
check_backup_or_restore_running() {
  local my_pid="$$"
  local my_ppid="$PPID"
  local processes line pid rest args
  # POSIX character class: inside a bracket expression "\s" does not mean
  # whitespace, so the word boundary has to be spelled [[:space:]].
  local subcommand_re='(^|[[:space:]/])(backup|restore)($|[[:space:]/])'

  processes=$(pgrep -af "backup\.py|openclaw-backup-cron|openclaw-restore" 2>/dev/null || true)
  while IFS= read -r line; do
    [[ -z "$line" ]] && continue
    # First field is the PID. Compare it exactly; a substring test on the
    # whole line would also skip unrelated processes whose PID or arguments
    # merely contain our PID digits.
    read -r pid rest <<< "$line"
    if [[ "$pid" == "$my_pid" || "$pid" == "$my_ppid" ]]; then
      continue
    fi
    if [[ "$line" == *backup.py* ]]; then
      # Everything after the last "backup.py" is the argument string.
      args="${line##*backup.py}"
      if [[ "$args" =~ $subcommand_re || "$args" == *--command=* ]]; then
        return 0
      fi
    fi
    if [[ "$line" == *openclaw-backup-cron* && "$line" != *watchdog* ]]; then
      return 0
    fi
    if [[ "$line" == *openclaw-restore* ]]; then
      return 0
    fi
  done <<< "$processes"
  return 1
}
# Return 0 when a restore appears to be in progress, 1 otherwise.
#
# A restore may be running before it has written anything to its log, so
# both the process list and restore.log are consulted:
#   1. a process whose command line matches "backup.py restore" -> in progress
#   2. otherwise, if the last "Starting restore" / "Restore completed" marker
#      in restore.log is "Starting restore":
#        - no parsable timestamp on that line -> in progress
#        - started less than 120 minutes ago  -> in progress
#        - otherwise a warning is logged and it is treated as finished
# Globals: BACKUP_LOG_DIR (read)
check_restore_in_progress() {
  # Step 1: live restore process.
  if pgrep -af "backup.py" 2>/dev/null | grep -v "grep" | grep -qE "backup\.py\s+restore"; then
    return 0
  fi

  # Step 2: restore log markers.
  local restore_log="${BACKUP_LOG_DIR}/restore.log"
  [[ -f "$restore_log" ]] || return 1

  local marker
  marker=$(grep -E "Starting restore|Restore completed" "$restore_log" 2>/dev/null | tail -1)
  [[ "$marker" == *"Starting restore"* ]] || return 1

  local stamp
  stamp=$(echo "$marker" | grep -oE '\[?[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\]?' | head -1 | tr -d '[]')
  # Without a timestamp the age is unknown; keep reporting "in progress".
  [[ -n "$stamp" ]] || return 0

  local started_at now minutes
  started_at=$(date -d "$stamp" +%s 2>/dev/null || echo "0")
  now=$(date +%s)
  minutes=$(( (now - started_at) / 60 ))
  if (( minutes >= 120 )); then
    # Two hours without a completion marker: stop waiting for this restore.
    log_warn "Restore appears to be stuck (${minutes}m), treating as completed"
    return 1
  fi
  return 0 # restore running and not timed out
}
# Return 0 when a process named exactly "cron" or "crond" exists, 1 otherwise.
check_cron_running() {
  local daemon
  for daemon in "cron" "crond"; do
    if pgrep -x "$daemon" >/dev/null 2>&1; then
      return 0
    fi
  done
  return 1
}
# Acquire the watchdog lock (flock on file descriptor 200) so that only one
# instance acts at a time.
# Globals:   LOCK_FILE (read)
# Arguments: $1 - seconds to wait for a contended lock (default 5)
# Returns:   0 when the lock is held, or when the lock file cannot be
#            opened (the caller then proceeds without a lock);
#            1 when the lock could not be obtained within the timeout
acquire_lock() {
  local timeout="${1:-5}"
  # The braces limit "2>/dev/null" to this one statement. Written directly
  # on a command-less `exec`, the redirection would become permanent and
  # silence the shell's stderr for the rest of the run.
  if ! { exec 200>"$LOCK_FILE"; } 2>/dev/null; then
    log_warn "Cannot create lock file, proceeding without lock"
    return 0
  fi
  # Try without blocking first, then wait up to the timeout.
  if ! flock -n 200 2>/dev/null; then
    log_info "Another watchdog instance is running, waiting for lock..."
    if ! flock -w "$timeout" 200 2>/dev/null; then
      log_warn "Could not acquire lock within ${timeout}s, skipping this check"
      return 1
    fi
  fi
  return 0
}
# Release the watchdog lock and close file descriptor 200.
# Safe to call when the lock was never taken; always returns 0.
release_lock() {
  flock -u 200 2>/dev/null || true
  # The braces limit "2>/dev/null" to this statement; placed directly on a
  # command-less `exec` it would permanently redirect the shell's stderr.
  { exec 200>&-; } 2>/dev/null || true
}
# Trigger a backup by running the standard cron backup script, so behaviour
# stays consistent with scheduled backups. The run is serialised with the
# watchdog lock.
# Globals: WATCHDOG_LOG_FILE, BACKUP_LOG_FILE (appended to)
# Returns: 0 when the backup succeeded or was skipped because a
#          backup/restore is active; 1 when the lock could not be taken;
#          otherwise the exit status of the cron script
force_backup() {
  local busy=""

  # Fast path: skip before touching the lock if something is already
  # running, which avoids a pointless lock wait.
  if check_backup_or_restore_running; then
    busy="Backup/restore process is running"
  elif check_restore_in_progress; then
    busy="Restore operation is in progress"
  fi
  if [[ -n "$busy" ]]; then
    log_info "Watchdog: ${busy}, skipping"
    return 0
  fi

  if ! acquire_lock 30; then
    log_warn "Could not acquire lock for backup, skipping"
    return 1
  fi

  # Re-check with the lock held: the situation may have changed while
  # waiting for it.
  if check_backup_or_restore_running; then
    busy="Backup/restore process started"
  elif check_restore_in_progress; then
    busy="Restore operation started"
  fi
  if [[ -n "$busy" ]]; then
    log_info "Watchdog: ${busy} while waiting for lock, skipping"
    release_lock
    return 0
  fi

  log_warn "Watchdog: Backup is overdue, triggering backup via cron script..."

  local started_at finished_at elapsed
  local output status=0 sink entry
  started_at=$(date +%s)
  output=$(/usr/local/bin/openclaw-backup-cron.sh 2>&1) || status=$?

  # Copy the script output, tagged with [WATCHDOG], into watchdog.log first
  # and then into backup.log.
  if [[ -n "$output" ]]; then
    for sink in "$WATCHDOG_LOG_FILE" "$BACKUP_LOG_FILE"; do
      echo "$output" | while IFS= read -r entry; do
        echo "[WATCHDOG] $entry" | timeout 1 tee -a "$sink" 2>/dev/null || echo "[WATCHDOG] $entry" >> "$sink" 2>/dev/null || true
      done
    done
  fi

  finished_at=$(date +%s)
  elapsed=$((finished_at - started_at))
  if (( status == 0 )); then
    log_info "Watchdog: Backup completed successfully (${elapsed}s)"
  else
    log_error "Watchdog: Backup failed (exit code: $status)"
  fi
  release_lock
  return $status
}
# Check that the newest backup found in the log is recent enough.
# Globals: MAX_BACKUP_AGE_MINUTES (read)
# Returns: 0 when the last backup is at most MAX_BACKUP_AGE_MINUTES old;
#          1 when no backup is recorded or the last one is older
check_backup_health() {
  local newest now
  newest=$(get_last_backup_time)
  now=$(date +%s)

  # "0" is the marker for "no backup found".
  if [[ "$newest" == "0" ]]; then
    log_warn "Watchdog: No previous backup found in logs"
    return 1
  fi

  local age_minutes=$(( (now - newest) / 60 ))
  if (( age_minutes <= MAX_BACKUP_AGE_MINUTES )); then
    log_info "Watchdog: Backup health OK (last: ${age_minutes}m ago)"
    return 0
  fi

  log_warn "Watchdog: Last backup was ${age_minutes} minutes ago (threshold: ${MAX_BACKUP_AGE_MINUTES})"
  return 1
}
# Sleep for $1 seconds without postponing trapped signals.
# A foreground `sleep` makes bash hold a trapped SIGTERM/SIGINT until the
# sleep ends; waiting on a background sleep lets `wait` return at once so
# the trap runs promptly. On interruption the background sleep is left to
# run out on its own.
# Returns: 0 after the sleep or an interruption; sleep's own non-zero
#          status otherwise (e.g. an invalid interval)
_watchdog_sleep() {
  local rc=0
  sleep "$1" </dev/null >/dev/null &
  wait "$!" || rc=$?
  # A status above 128 means wait was cut short by a signal.
  if (( rc > 128 )); then
    return 0
  fi
  return "$rc"
}

# Main monitoring loop; runs until the process is terminated.
# Each iteration:
#   1. skips if a backup/restore process is running
#   2. skips if a restore is in progress
#   3. requests a backup if the backup health check fails
#   4. otherwise requests a backup once FORCE_BACKUP_INTERVAL has elapsed
#      since the last successful force_backup call
#   5. runs force_backup when requested, backing off after repeated failures
# Globals: WATCHDOG_INTERVAL, MAX_BACKUP_AGE_MINUTES, FORCE_BACKUP_INTERVAL
watchdog_loop() {
  log_info "========================================"
  log_info "OpenClaw Backup Watchdog Started"
  log_info "========================================"
  log_info "Check interval: ${WATCHDOG_INTERVAL}s"
  log_info "Max backup age: ${MAX_BACKUP_AGE_MINUTES} minutes"
  log_info "Force backup interval: ${FORCE_BACKUP_INTERVAL}s"
  log_info "Note: Watchdog only triggers backup when needed, respects running processes"
  # NOTE(review): starting at 0 means the force interval already counts as
  # exceeded on the first healthy check, so a backup is triggered right
  # after startup - confirm this is intended.
  local last_forced_backup=0
  local check_count=0
  local consecutive_failures=0
  local max_consecutive_failures=3
  while true; do
    check_count=$((check_count + 1))
    local current_time
    current_time=$(date +%s)
    # Quiet mode: only log when there is something worth recording.
    local need_backup=false
    local skip_reason=""
    # 1. Is a backup/restore process running?
    if check_backup_or_restore_running; then
      skip_reason="backup/restore process running"
      consecutive_failures=0 # reset the failure counter
    # 2. Is a restore in progress?
    elif check_restore_in_progress; then
      skip_reason="restore in progress"
      consecutive_failures=0 # reset the failure counter
    # 3. Backup health
    elif ! check_backup_health; then
      need_backup=true
    fi
    # 4. Forced backup interval (only when the health check passed)
    if [[ "$need_backup" == "false" && -z "$skip_reason" ]]; then
      local time_since_forced=$((current_time - last_forced_backup))
      if [[ $time_since_forced -ge $FORCE_BACKUP_INTERVAL ]]; then
        log_info "Watchdog: Force backup interval reached (${time_since_forced}s)"
        need_backup=true
      fi
    fi
    # 5. Run the backup if one was requested
    if [[ "$need_backup" == "true" ]]; then
      if force_backup; then
        last_forced_backup=$(date +%s)
        consecutive_failures=0
      else
        consecutive_failures=$((consecutive_failures + 1))
        if [[ $consecutive_failures -ge $max_consecutive_failures ]]; then
          log_error "Watchdog: Backup failed ${consecutive_failures} consecutive times, backing off"
          # Wait twice as long before retrying to avoid hammering a failing backup.
          _watchdog_sleep "$((WATCHDOG_INTERVAL * 2))"
          consecutive_failures=0
          continue
        fi
      fi
    elif [[ -n "$skip_reason" ]]; then
      # Report the skip reason only on checks 1, 11, 21, ... to limit log noise.
      if (( check_count % 10 == 1 )); then
        log_info "Watchdog: Check #${check_count} - Skipped (${skip_reason})"
      fi
    fi
    # Wait for the next check
    _watchdog_sleep "$WATCHDOG_INTERVAL"
  done
}
# Shutdown handler: log the shutdown, release the lock (failures ignored),
# remove the PID file and exit with status 0.
cleanup() {
  log_info "Watchdog shutting down..."
  if ! release_lock 2>/dev/null; then
    : # nothing to release, or release failed - ignored on purpose
  fi
  rm -f "$PID_FILE"
  exit 0
}

# Signal handling: run cleanup on termination and interrupt.
trap cleanup TERM INT
# Entry point for foreground operation.
# Refuses to start when another live instance is recorded in PID_FILE,
# exits quietly when OPENCLAW_BACKUP_ENABLED is not truthy, otherwise
# records its own PID and enters the monitoring loop.
# Globals: PID_FILE, BACKUP_LOG_DIR, OPENCLAW_BACKUP_ENABLED (read)
main() {
  # Is another instance already running?
  if [[ -f "$PID_FILE" ]]; then
    local old_pid
    old_pid=$(cat "$PID_FILE" 2>/dev/null || echo "0")
    # Only a numeric PID other than our own counts as another instance.
    # The PID file may already hold our own PID (daemon_mode records the
    # child's PID before the child reaches this check); treating that as
    # "already running" would make the daemon quit at once. Non-numeric
    # content is never handed to kill.
    if [[ "$old_pid" =~ ^[0-9]+$ ]] \
      && [[ "$old_pid" != "0" && "$old_pid" != "$$" ]] \
      && kill -0 "$old_pid" 2>/dev/null; then
      log_error "Watchdog is already running (PID: $old_pid)"
      exit 1
    else
      # Stale or self-referencing entry: remove it; it is rewritten below.
      rm -f "$PID_FILE"
    fi
  fi
  # Create the log directory if it does not exist
  mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || {
    echo "Warning: Cannot create log directory $BACKUP_LOG_DIR" >&2
  }
  # Check if backup is enabled
  if ! is_true "${OPENCLAW_BACKUP_ENABLED:-false}"; then
    log_info "Watchdog: Backup is disabled (OPENCLAW_BACKUP_ENABLED=false), exiting"
    exit 0
  fi
  # Record our PID (best effort)
  echo $$ > "$PID_FILE" 2>/dev/null || {
    log_warn "Cannot write PID file $PID_FILE (continuing anyway)"
  }
  # Enter the monitoring loop
  watchdog_loop
}
# Start the watchdog as a background daemon.
# Globals: PID_FILE, WATCHDOG_LOG_FILE (read)
# Returns: 0 after launching the child; 1 when a live instance is already
#          recorded in PID_FILE
daemon_mode() {
  # Refuse to launch a second daemon next to a live one.
  if [[ -f "$PID_FILE" ]]; then
    local running_pid
    running_pid=$(cat "$PID_FILE" 2>/dev/null || true)
    if [[ "$running_pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$running_pid" 2>/dev/null; then
      log_error "Watchdog is already running (PID: $running_pid)"
      return 1
    fi
  fi
  log_info "Starting watchdog in daemon mode..."
  # stdout is discarded because the log functions already append every
  # message to the watchdog log themselves; sending stdout to the same file
  # would record each line twice. stderr is still captured in the log.
  nohup "$0" >/dev/null 2>>"$WATCHDOG_LOG_FILE" &
  local pid=$!
  # The PID file is deliberately NOT written here: the child records its own
  # PID on startup. Pre-writing it makes the child find its own PID in the
  # file, conclude that a watchdog is already running, and exit.
  log_info "Watchdog daemon started (PID: $pid)"
}
# Command-line handling.
#   -d/--daemon  start in the background
#   -s/--stop    stop the recorded instance and remove PID/lock files
#   -c/--check   print a one-off status report
#   -h/--help    show usage
#   (anything else, including no argument) run in the foreground
case "${1:-}" in
  -d | --daemon)
    daemon_mode
    ;;
  -s | --stop)
    if [[ -f "$PID_FILE" ]]; then
      pid=$(cat "$PID_FILE" 2>/dev/null || true)
      # Only signal a plain positive PID: values such as "0" or "-1" would
      # make kill target the whole process group or every process.
      if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill "$pid" 2>/dev/null; then
        echo "Watchdog stopped (PID: $pid)"
      else
        echo "Watchdog not running or already stopped"
      fi
      rm -f "$PID_FILE"
      rm -f "$LOCK_FILE"
    else
      echo "Watchdog is not running (no PID file)"
    fi
    ;;
  -c | --check)
    # One-off status check
    echo "=== Watchdog Status Check ==="
    echo "Cron running: $(check_cron_running && echo "YES" || echo "NO")"
    echo "Backup/restore running: $(check_backup_or_restore_running && echo "YES" || echo "NO")"
    echo "Restore in progress: $(check_restore_in_progress && echo "YES" || echo "NO")"
    echo "Backup health: $(check_backup_health && echo "OK" || echo "NEEDS ATTENTION")"
    # get_last_backup_time prints 0 when no backup is known; report that as
    # "Unknown" rather than formatting epoch 0 as a 1970 date.
    last_backup_epoch=$(get_last_backup_time)
    if [[ "$last_backup_epoch" =~ ^[1-9][0-9]*$ ]]; then
      last_backup_human=$(date -d "@${last_backup_epoch}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "Unknown")
    else
      last_backup_human="Unknown"
    fi
    echo "Last backup: ${last_backup_human}"
    ;;
  -h | --help)
    cat <<EOF
OpenClaw Backup Watchdog - 备份系统兜底保障
用法: $0 [选项]
选项:
-d, --daemon 后台守护模式运行
-s, --stop 停止守护进程
-c, --check 单次检查模式
-h, --help 显示帮助
环境变量:
WATCHDOG_INTERVAL 检查间隔(秒,默认600)
MAX_BACKUP_AGE_MINUTES 最大备份间隔(分钟,默认30)
FORCE_BACKUP_INTERVAL 强制备份间隔(秒,默认14400)
说明:
本脚本作为备份系统的兜底保障,仅在检测到备份缺失且
没有其他备份/恢复进程运行时,才会触发备份。
不会干扰正常的备份和恢复流程。
关键特性:
- 使用文件锁防止并发执行
- 检测 backup.py backup 和 backup.py restore
- 恢复操作超时保护(2小时)
- 连续失败退避机制
日志位置: $WATCHDOG_LOG_FILE
EOF
    ;;
  *)
    main
    ;;
esac