| #!/usr/bin/env bash |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Strict mode: abort on command errors, on use of unset variables, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# --- Paths -------------------------------------------------------------------
# Directory containing this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Log directory; OPENCLAW_BACKUP_LOG_DIR overrides the default location.
BACKUP_LOG_DIR="${OPENCLAW_BACKUP_LOG_DIR:-/var/log/openclaw}"
BACKUP_LOG_FILE="${BACKUP_LOG_DIR}/backup.log"
WATCHDOG_LOG_FILE="${BACKUP_LOG_DIR}/watchdog.log"
PID_FILE="/var/run/openclaw-backup-watchdog.pid"
LOCK_FILE="/var/run/openclaw-backup-watchdog.lock"

# --- Tunables (each can be overridden from the environment) ------------------
# Seconds between two watchdog checks.
: "${WATCHDOG_INTERVAL:=300}"
# A backup older than this many minutes is reported as overdue.
: "${MAX_BACKUP_AGE_MINUTES:=20}"
# Seconds after which the watchdog forces a backup even if health is OK.
: "${FORCE_BACKUP_INTERVAL:=3600}"

# --- ANSI colour escapes used by the log helpers -----------------------------
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
|
|
| |
#######################################
# Log an informational message.
# The green, timestamped line is printed to stdout exactly once and appended
# to the watchdog log file on a best-effort basis.
# Globals:   BACKUP_LOG_DIR, WATCHDOG_LOG_FILE, GREEN, NC (read)
# Arguments: message words (joined with spaces)
# Returns:   0; logging failures are never fatal
#######################################
log_info() {
  mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || true
  local msg
  msg="[INFO] $(date '+%Y-%m-%d %H:%M:%S') $*"
  local line="${GREEN}${msg}${NC}"
  # Console output is emitted directly so a failing file write cannot cause
  # the message to be printed a second time.
  echo -e "$line" || true
  # File append is guarded by a 1s timeout; its stdout is discarded because
  # the console copy was already written above.
  echo -e "$line" | timeout 1 tee -a "$WATCHDOG_LOG_FILE" >/dev/null 2>&1 || true
}
|
|
#######################################
# Log a warning message.
# The yellow, timestamped line is printed to stdout exactly once and appended
# to the watchdog log file on a best-effort basis.
# Globals:   BACKUP_LOG_DIR, WATCHDOG_LOG_FILE, YELLOW, NC (read)
# Arguments: message words (joined with spaces)
# Returns:   0; logging failures are never fatal
#######################################
log_warn() {
  mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || true
  local msg
  msg="[WARN] $(date '+%Y-%m-%d %H:%M:%S') $*"
  local line="${YELLOW}${msg}${NC}"
  # Console output is emitted directly so a failing file write cannot cause
  # the message to be printed a second time.
  echo -e "$line" || true
  # File append is guarded by a 1s timeout; its stdout is discarded because
  # the console copy was already written above.
  echo -e "$line" | timeout 1 tee -a "$WATCHDOG_LOG_FILE" >/dev/null 2>&1 || true
}
|
|
#######################################
# Log an error message.
# The red, timestamped line is printed to stdout exactly once and appended
# to the watchdog log file on a best-effort basis.
# Globals:   BACKUP_LOG_DIR, WATCHDOG_LOG_FILE, RED, NC (read)
# Arguments: message words (joined with spaces)
# Returns:   0; logging failures are never fatal
#######################################
log_error() {
  mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || true
  local msg
  msg="[ERROR] $(date '+%Y-%m-%d %H:%M:%S') $*"
  local line="${RED}${msg}${NC}"
  # Console output is emitted directly so a failing file write cannot cause
  # the message to be printed a second time.
  echo -e "$line" || true
  # File append is guarded by a 1s timeout; its stdout is discarded because
  # the console copy was already written above.
  echo -e "$line" | timeout 1 tee -a "$WATCHDOG_LOG_FILE" >/dev/null 2>&1 || true
}
|
|
#######################################
# Test whether a value is a "truthy" flag.
# Arguments: $1 - value to test (optional; missing counts as empty)
# Returns:   0 if the lower-cased value is 1, true, yes or on; 1 otherwise
#######################################
is_true() {
  local normalized
  normalized="$(tr '[:upper:]' '[:lower:]' <<<"${1:-}")"
  case "$normalized" in
    1 | true | yes | on) return 0 ;;
    *) return 1 ;;
  esac
}
|
|
| |
#######################################
# Print the epoch time of the most recent successful backup found in the
# backup log.
# The last log line containing "backup uploaded:", "Full backup complete" or
# "Incremental backup complete" is selected, and the first
# YYYY-MM-DDTHH:MM:SS timestamp on that line is converted with `date -d`.
# Globals:   BACKUP_LOG_FILE (read)
# Outputs:   epoch seconds on stdout, or "0" when the log is missing, has no
#            completion line, or the timestamp cannot be parsed
# Returns:   0
#######################################
get_last_backup_time() {
  if [[ ! -f "$BACKUP_LOG_FILE" ]]; then
    echo "0"
    return 0
  fi

  # `|| true`: grep exits 1 when nothing matches; with errexit + pipefail that
  # would otherwise abort the whole script when this function is called
  # outside a command substitution.
  local last_backup
  last_backup=$(grep -E "backup uploaded:|Full backup complete|Incremental backup complete" \
    "$BACKUP_LOG_FILE" 2>/dev/null | tail -n 1) || true

  # Leftmost timestamp on the line; this covers both a leading "[timestamp]"
  # prefix and a timestamp that appears later in the line.
  local ts_re='[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}'
  if [[ -n "$last_backup" && "$last_backup" =~ $ts_re ]]; then
    date -d "${BASH_REMATCH[0]}" +%s 2>/dev/null || echo "0"
  else
    echo "0"
  fi
}
|
|
#######################################
# Detect whether another backup or restore process is currently running.
# Scans `pgrep -af` output for backup.py (with a backup/restore sub-command
# or a --command= option), openclaw-backup-cron (excluding watchdog command
# lines) and openclaw-restore. This shell and its parent are ignored.
# Returns:   0 if such a process is found, 1 otherwise
#######################################
check_backup_or_restore_running() {
  local my_pid="$$"
  local my_ppid="$PPID"

  local processes
  processes=$(pgrep -af "backup\.py|openclaw-backup-cron|openclaw-restore" 2>/dev/null || true)

  # "backup" or "restore" as a whole word or path component. POSIX character
  # classes are used because "\s" inside a bracket expression is a literal
  # backslash and "s", not whitespace.
  local subcmd_re='(^|[[:space:]/])(backup|restore)($|[[:space:]/])'

  local line pid args
  while IFS= read -r line; do
    [[ -z "$line" ]] && continue

    # Exclude ourselves by exact PID (first field of the pgrep line); a
    # substring match on the whole line would also drop unrelated processes
    # whose PID or arguments merely contain the same digits.
    read -r pid _ <<<"$line"
    if [[ "$pid" == "$my_pid" || "$pid" == "$my_ppid" ]]; then
      continue
    fi

    if [[ "$line" =~ backup\.py ]]; then
      # Everything after the last "backup.py" on the command line.
      args="${line##*backup.py}"
      if [[ "$args" =~ $subcmd_re || "$args" == *--command=* ]]; then
        return 0
      fi
    fi

    if [[ "$line" == *openclaw-backup-cron* && "$line" != *watchdog* ]]; then
      return 0
    fi

    if [[ "$line" == *openclaw-restore* ]]; then
      return 0
    fi
  done <<<"$processes"

  return 1
}
|
|
| |
| |
| |
#######################################
# Decide whether a restore operation is in progress.
# First looks for a running "backup.py restore" process. Otherwise inspects
# ${BACKUP_LOG_DIR}/restore.log: when the last "Starting restore" /
# "Restore completed" line is a start marker, the restore is considered in
# progress unless its timestamp is at least 120 minutes old, in which case a
# warning is logged and it is treated as finished. A start marker without a
# timestamp counts as in progress.
# Globals:   BACKUP_LOG_DIR (read)
# Returns:   0 if a restore is in progress, 1 otherwise
#######################################
check_restore_in_progress() {
  # The process list is captured first and then matched line by line. This
  # avoids a `grep -q` at the end of a pipefail pipeline (where early exit can
  # turn a match into a failure) and avoids filtering out real restore
  # processes whose arguments happen to contain the word "grep".
  local procs
  procs=$(pgrep -af 'backup\.py' 2>/dev/null || true)
  if [[ -n "$procs" ]] && grep -qE 'backup\.py[[:space:]]+restore' <<<"$procs"; then
    return 0
  fi

  local restore_log="${BACKUP_LOG_DIR}/restore.log"
  [[ -f "$restore_log" ]] || return 1

  local last_restore
  last_restore=$(grep -E "Starting restore|Restore completed" "$restore_log" 2>/dev/null \
    | tail -n 1) || true
  [[ "$last_restore" == *"Starting restore"* ]] || return 1

  local ts_re='[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}'
  if [[ ! "$last_restore" =~ $ts_re ]]; then
    # Start marker without a usable timestamp: assume still running.
    return 0
  fi

  local restore_start current_time elapsed_minutes
  # An unparseable timestamp yields 0, i.e. a very old start time.
  restore_start=$(date -d "${BASH_REMATCH[0]}" +%s 2>/dev/null || echo "0")
  current_time=$(date +%s)
  elapsed_minutes=$(( (current_time - restore_start) / 60 ))

  if (( elapsed_minutes < 120 )); then
    return 0
  fi

  log_warn "Restore appears to be stuck (${elapsed_minutes}m), treating as completed"
  return 1
}
|
|
| |
#######################################
# Check whether a cron daemon is running.
# Looks for a process named exactly "cron", then "crond".
# Returns:   0 if either is found, 1 otherwise
#######################################
check_cron_running() {
  local daemon_name
  for daemon_name in cron crond; do
    if pgrep -x "$daemon_name" >/dev/null 2>&1; then
      return 0
    fi
  done
  return 1
}
|
|
| |
| |
#######################################
# Acquire the watchdog's exclusive file lock on file descriptor 200.
# Globals:   LOCK_FILE (read)
# Arguments: $1 - seconds to wait for a busy lock (default 5)
# Returns:   0 when the lock is held, or when the lock file cannot be opened
#            (the caller then proceeds without locking);
#            1 when the lock could not be obtained within the timeout
#######################################
acquire_lock() {
  local timeout="${1:-5}"

  # The brace group confines "2>/dev/null" to this one statement. Attached
  # directly to a command-less `exec`, that redirection would permanently
  # send the whole script's stderr to /dev/null.
  if ! { exec 200>"$LOCK_FILE"; } 2>/dev/null; then
    log_warn "Cannot create lock file, proceeding without lock"
    return 0
  fi

  # Try without blocking first so the "waiting" message is only logged when
  # another instance really holds the lock.
  if ! flock -n 200 2>/dev/null; then
    log_info "Another watchdog instance is running, waiting for lock..."
    if ! flock -w "$timeout" 200 2>/dev/null; then
      log_warn "Could not acquire lock within ${timeout}s, skipping this check"
      return 1
    fi
  fi

  return 0
}
|
|
| |
#######################################
# Release the lock held on file descriptor 200 and close the descriptor.
# Safe to call when no lock is held; all errors are ignored.
# Returns:   0
#######################################
release_lock() {
  flock -u 200 2>/dev/null || true
  # The brace group keeps "2>/dev/null" temporary. Attached directly to a
  # command-less `exec`, it would permanently silence the script's stderr.
  { exec 200>&-; } 2>/dev/null || true
}
|
|
| |
| |
#######################################
# Run the cron backup script on behalf of the watchdog.
# Skips (status 0) when a backup/restore is running, both before and after
# taking the watchdog lock. Output of the backup script is prefixed with
# "[WATCHDOG] " and written once to stdout and appended to both the watchdog
# log and the backup log.
# Globals:   WATCHDOG_LOG_FILE, BACKUP_LOG_FILE (read)
#            OPENCLAW_BACKUP_CRON_SCRIPT (read, optional) - script to run;
#              defaults to /usr/local/bin/openclaw-backup-cron.sh
# Returns:   0 on success or skip, 1 if the lock could not be acquired,
#            otherwise the backup script's exit status
#######################################
force_backup() {
  if check_backup_or_restore_running; then
    log_info "Watchdog: Backup/restore process is running, skipping"
    return 0
  fi

  if check_restore_in_progress; then
    log_info "Watchdog: Restore operation is in progress, skipping"
    return 0
  fi

  if ! acquire_lock 30; then
    log_warn "Could not acquire lock for backup, skipping"
    return 1
  fi

  # Re-check after (possibly) waiting for the lock: another backup or a
  # restore may have started in the meantime.
  if check_backup_or_restore_running; then
    log_info "Watchdog: Backup/restore process started while waiting for lock, skipping"
    release_lock
    return 0
  fi

  if check_restore_in_progress; then
    log_info "Watchdog: Restore operation started while waiting for lock, skipping"
    release_lock
    return 0
  fi

  log_warn "Watchdog: Backup is overdue, triggering backup via cron script..."

  local cron_script="${OPENCLAW_BACKUP_CRON_SCRIPT:-/usr/local/bin/openclaw-backup-cron.sh}"
  local backup_start
  backup_start=$(date +%s)

  # `|| exit_code=$?` captures a failure without tripping errexit.
  local cron_output exit_code=0
  cron_output=$("$cron_script" 2>&1) || exit_code=$?

  if [[ -n "$cron_output" ]]; then
    # One tee invocation feeds stdout and both log files, so every line is
    # emitted exactly once per destination. The timeout keeps a blocked log
    # write from stalling the watchdog; logging failures are ignored.
    local line
    while IFS= read -r line; do
      printf '[WATCHDOG] %s\n' "$line"
    done <<<"$cron_output" \
      | timeout 5 tee -a "$WATCHDOG_LOG_FILE" "$BACKUP_LOG_FILE" 2>/dev/null \
      || true
  fi

  local backup_end
  backup_end=$(date +%s)
  local duration=$((backup_end - backup_start))

  if [[ $exit_code -eq 0 ]]; then
    log_info "Watchdog: Backup completed successfully (${duration}s)"
  else
    log_error "Watchdog: Backup failed (exit code: $exit_code)"
  fi

  release_lock
  return "$exit_code"
}
|
|
| |
#######################################
# Report whether the most recent backup is recent enough.
# Globals:   MAX_BACKUP_AGE_MINUTES (read)
# Returns:   0 if the last backup is at most MAX_BACKUP_AGE_MINUTES old;
#            1 if no backup is recorded or the last one is older
#######################################
check_backup_health() {
  local newest_epoch now_epoch
  newest_epoch=$(get_last_backup_time)
  now_epoch=$(date +%s)

  # "0" is the sentinel get_last_backup_time prints when nothing was found.
  if [[ "$newest_epoch" == "0" ]]; then
    log_warn "Watchdog: No previous backup found in logs"
    return 1
  fi

  local age_minutes=$(( (now_epoch - newest_epoch) / 60 ))

  if (( age_minutes > MAX_BACKUP_AGE_MINUTES )); then
    log_warn "Watchdog: Last backup was ${age_minutes} minutes ago (threshold: ${MAX_BACKUP_AGE_MINUTES})"
    return 1
  fi

  log_info "Watchdog: Backup health OK (last: ${age_minutes}m ago)"
  return 0
}
|
|
| |
#######################################
# Sleep for the given number of seconds while staying responsive to signals.
# A foreground `sleep` postpones trap handlers until it finishes; waiting on
# a background sleep lets a trapped signal interrupt the pause immediately.
# Arguments: $1 - seconds to sleep
# Returns:   0
#######################################
_watchdog_sleep() {
  sleep "$1" &
  wait "$!" || true
}

#######################################
# Main watchdog loop; never returns.
# On every iteration: skip when a backup/restore is running, trigger a backup
# when the backup health check fails, and otherwise force a backup once
# FORCE_BACKUP_INTERVAL seconds have passed since the last one this loop
# triggered. The timer starts at 0, so the first healthy check forces a
# backup. After 3 consecutive failed backups the loop backs off for twice
# the normal interval.
# Globals:   WATCHDOG_INTERVAL, MAX_BACKUP_AGE_MINUTES,
#            FORCE_BACKUP_INTERVAL (read)
#######################################
watchdog_loop() {
  log_info "========================================"
  log_info "OpenClaw Backup Watchdog Started"
  log_info "========================================"
  log_info "Check interval: ${WATCHDOG_INTERVAL}s"
  log_info "Max backup age: ${MAX_BACKUP_AGE_MINUTES} minutes"
  log_info "Force backup interval: ${FORCE_BACKUP_INTERVAL}s"
  log_info "Note: Watchdog only triggers backup when needed, respects running processes"

  local last_forced_backup=0
  local check_count=0
  local consecutive_failures=0
  local max_consecutive_failures=3
  local current_time need_backup skip_reason time_since_forced

  while true; do
    check_count=$((check_count + 1))
    current_time=$(date +%s)
    need_backup=false
    skip_reason=""

    if check_backup_or_restore_running; then
      skip_reason="backup/restore process running"
      consecutive_failures=0
    elif check_restore_in_progress; then
      skip_reason="restore in progress"
      consecutive_failures=0
    elif ! check_backup_health; then
      need_backup=true
    fi

    # Health is fine and nothing is running: force a backup anyway once the
    # force interval has elapsed.
    if [[ "$need_backup" == "false" && -z "$skip_reason" ]]; then
      time_since_forced=$((current_time - last_forced_backup))
      if [[ $time_since_forced -ge $FORCE_BACKUP_INTERVAL ]]; then
        log_info "Watchdog: Force backup interval reached (${time_since_forced}s)"
        need_backup=true
      fi
    fi

    if [[ "$need_backup" == "true" ]]; then
      if force_backup; then
        last_forced_backup=$(date +%s)
        consecutive_failures=0
      else
        consecutive_failures=$((consecutive_failures + 1))
        if [[ $consecutive_failures -ge $max_consecutive_failures ]]; then
          log_error "Watchdog: Backup failed ${consecutive_failures} consecutive times, backing off"
          _watchdog_sleep $((WATCHDOG_INTERVAL * 2))
          consecutive_failures=0
          continue
        fi
      fi
    elif [[ -n "$skip_reason" ]]; then
      # Log skips only on checks 1, 11, 21, ... to keep the log quiet.
      if (( check_count % 10 == 1 )); then
        log_info "Watchdog: Check #${check_count} - Skipped (${skip_reason})"
      fi
    fi

    _watchdog_sleep "$WATCHDOG_INTERVAL"
  done
}
|
|
| |
#######################################
# Signal handler: release the lock, remove this process's PID file and exit.
# The PID file is removed only when it records this process. The handler is
# installed for every invocation mode, so an interrupted one-shot command
# must not delete the PID file of a running daemon.
# Globals:   PID_FILE (read)
# Returns:   does not return; exits with status 0
#######################################
cleanup() {
  log_info "Watchdog shutting down..."
  release_lock 2>/dev/null || true

  local recorded_pid=""
  if [[ -f "$PID_FILE" ]]; then
    recorded_pid=$(cat "$PID_FILE" 2>/dev/null || true)
  fi
  if [[ "$recorded_pid" == "$$" ]]; then
    rm -f "$PID_FILE"
  fi

  exit 0
}

# Shut down cleanly on termination and on Ctrl-C.
trap cleanup SIGTERM SIGINT
|
|
| |
#######################################
# Foreground entry point: make sure no other instance is running, honour the
# OPENCLAW_BACKUP_ENABLED switch, record the PID and enter the watchdog loop.
# Globals:   PID_FILE, BACKUP_LOG_DIR (read)
#            OPENCLAW_BACKUP_ENABLED (read; treated as false when unset)
# Returns:   exits 1 if another live instance is recorded in PID_FILE,
#            exits 0 if backups are disabled; otherwise runs watchdog_loop
#######################################
main() {
  if [[ -f "$PID_FILE" ]]; then
    local old_pid
    old_pid=$(cat "$PID_FILE" 2>/dev/null || echo "0")
    # A PID file that already holds our own PID is not "another instance":
    # that happens when the daemon launcher recorded the child's PID before
    # the child got here, or when a stale file matches a reused PID.
    # Non-numeric content is treated as stale.
    if [[ "$old_pid" =~ ^[0-9]+$ && "$old_pid" != "0" && "$old_pid" != "$$" ]] \
      && kill -0 "$old_pid" 2>/dev/null; then
      log_error "Watchdog is already running (PID: $old_pid)"
      exit 1
    else
      rm -f "$PID_FILE"
    fi
  fi

  mkdir -p "$BACKUP_LOG_DIR" 2>/dev/null || {
    echo "Warning: Cannot create log directory $BACKUP_LOG_DIR" >&2
  }

  if ! is_true "${OPENCLAW_BACKUP_ENABLED:-false}"; then
    log_info "Watchdog: Backup is disabled (OPENCLAW_BACKUP_ENABLED=false), exiting"
    exit 0
  fi

  # Best effort: the watchdog still runs if the PID file cannot be written.
  echo $$ > "$PID_FILE" 2>/dev/null || {
    log_warn "Cannot write PID file $PID_FILE (continuing anyway)"
  }

  watchdog_loop
}
|
|
| |
#######################################
# Start the watchdog as a detached background process.
# Refuses to start when PID_FILE records a live process. The child's stdout
# is discarded because the log helpers append to the watchdog log themselves
# (sending stdout to the same file would record every line twice); its
# stderr is appended to the watchdog log. The PID file is written by the
# child itself once it has passed its own start-up checks in main.
# Globals:   PID_FILE, WATCHDOG_LOG_FILE (read)
# Returns:   0 after launching, 1 if an instance is already running
#######################################
daemon_mode() {
  local existing_pid=""
  if [[ -f "$PID_FILE" ]]; then
    existing_pid=$(cat "$PID_FILE" 2>/dev/null || true)
  fi
  if [[ "$existing_pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$existing_pid" 2>/dev/null; then
    log_error "Watchdog is already running (PID: $existing_pid)"
    return 1
  fi
  # Whatever is left in the PID file is stale at this point.
  rm -f "$PID_FILE" 2>/dev/null || true

  log_info "Starting watchdog in daemon mode..."
  nohup "$0" >/dev/null 2>>"$WATCHDOG_LOG_FILE" &
  local pid=$!
  log_info "Watchdog daemon started (PID: $pid)"
}
|
|
| |
#######################################
# Command-line dispatcher.
# Arguments: $1 - one of -d|--daemon, -s|--stop, -c|--check, -h|--help;
#                 anything else (including no argument) runs the watchdog
#                 in the foreground via main
# Outputs:   status / help text on stdout depending on the mode
#######################################
cli_dispatch() {
  case "${1:-}" in
    -d | --daemon)
      daemon_mode
      ;;
    -s | --stop)
      if [[ -f "$PID_FILE" ]]; then
        local pid
        pid=$(cat "$PID_FILE" 2>/dev/null || true)
        # Only signal a well-formed PID; anything else counts as not running.
        if [[ "$pid" =~ ^[0-9]+$ ]] && kill "$pid" 2>/dev/null; then
          echo "Watchdog stopped (PID: $pid)"
        else
          echo "Watchdog not running or already stopped"
        fi
        rm -f "$PID_FILE"
        rm -f "$LOCK_FILE"
      else
        echo "Watchdog is not running (no PID file)"
      fi
      ;;
    -c | --check)
      # The check helpers may log to stdout; discard that here so the status
      # lines below contain only the YES/NO/OK values.
      local cron_state="NO" running_state="NO" restore_state="NO"
      local health_state="NEEDS ATTENTION"
      if check_cron_running >/dev/null; then cron_state="YES"; fi
      if check_backup_or_restore_running >/dev/null; then running_state="YES"; fi
      if check_restore_in_progress >/dev/null; then restore_state="YES"; fi
      if check_backup_health >/dev/null; then health_state="OK"; fi

      # "0" (or anything non-numeric) means no backup is known; show that as
      # "Unknown" rather than as the Unix epoch.
      local last_epoch last_human="Unknown"
      last_epoch=$(get_last_backup_time 2>/dev/null || echo "0")
      if [[ "$last_epoch" =~ ^[1-9][0-9]*$ ]]; then
        last_human=$(date -d "@${last_epoch}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null) \
          || last_human="Unknown"
      fi

      echo "=== Watchdog Status Check ==="
      echo "Cron running: ${cron_state}"
      echo "Backup/restore running: ${running_state}"
      echo "Restore in progress: ${restore_state}"
      echo "Backup health: ${health_state}"
      echo "Last backup: ${last_human}"
      ;;
    -h | --help)
      cat <<EOF
OpenClaw Backup Watchdog - 备份系统兜底保障

用法: $0 [选项]

选项:
  -d, --daemon 后台守护模式运行
  -s, --stop 停止守护进程
  -c, --check 单次检查模式
  -h, --help 显示帮助

环境变量:
  WATCHDOG_INTERVAL 检查间隔(秒,默认300)
  MAX_BACKUP_AGE_MINUTES 最大备份间隔(分钟,默认20)
  FORCE_BACKUP_INTERVAL 强制备份间隔(秒,默认3600)

说明:
  本脚本作为备份系统的兜底保障,仅在检测到备份缺失且
  没有其他备份/恢复进程运行时,才会触发备份。
  不会干扰正常的备份和恢复流程。

关键特性:
  - 使用文件锁防止并发执行
  - 检测 backup.py backup 和 backup.py restore
  - 恢复操作超时保护(2小时)
  - 连续失败退避机制

日志位置: $WATCHDOG_LOG_FILE
EOF
      ;;
    *)
      main
      ;;
  esac
}

cli_dispatch "$@"
|
|