#!/usr/bin/env bash
#
# OpenClaw Backup Health Check & Auto-Repair Script
# (health check and automatic repair script)
#
# Runs a series of checks against the backup tooling (required commands,
# environment variables, HuggingFace connectivity, dataset access, script
# integrity, recent backup status, disk space) and, when something fails,
# attempts a repair. Run with --help for the list of options.
#
# Environment variables read:
#   OPENCLAW_BACKUP_ENV_FILE_PATH         env file to source
#   OPENCLAW_BACKUP_LOG_DIR               directory holding the log files
#   OPENCLAW_MAX_BACKUP_AGE_MINUTES       max age of the last good backup
#   OPENCLAW_MAX_FAILED_ATTEMPTS          failures before emergency repair
#   OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED  "false" skips the network check
#   OPENCLAW_DEBUG                        "true" enables debug logging
#   HF_TOKEN, OPENCLAW_BACKUP_DATASET_REPO, OPENCLAW_BACKUP_CRON
#

set -euo pipefail

# Configuration
# shellcheck disable=SC2034  # kept for files sourced via load_env
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BACKUP_ENV_FILE_PATH="${OPENCLAW_BACKUP_ENV_FILE_PATH:-/root/.env.d/openclaw-backup.env}"
BACKUP_LOG_DIR="${OPENCLAW_BACKUP_LOG_DIR:-/var/log/openclaw}"
BACKUP_LOG_FILE="${BACKUP_LOG_DIR}/backup.log"
HEALTH_LOG_FILE="${BACKUP_LOG_DIR}/health-check.log"
MAX_BACKUP_AGE_MINUTES="${OPENCLAW_MAX_BACKUP_AGE_MINUTES:-30}"
MAX_FAILED_ATTEMPTS="${OPENCLAW_MAX_FAILED_ATTEMPTS:-3}"
FAILED_ATTEMPTS_FILE="${BACKUP_LOG_DIR}/.failed-attempts"

# Colors for output. Stored as real escape bytes so they can be printed
# with printf '%s' and no escape interpretation is applied to log messages.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
BLUE=$'\033[0;34m'
NC=$'\033[0m' # No Color

#######################################
# Print one timestamped log line to stdout and append it to the health log.
# Globals:   NC, HEALTH_LOG_FILE (read)
# Arguments: $1 - color escape, $2 - level label, rest - message
#######################################
_log() {
  local color=$1
  local level=$2
  shift 2
  printf '%s[%s]%s %s %s\n' \
    "$color" "$level" "$NC" "$(date '+%Y-%m-%d %H:%M:%S')" "$*" \
    | tee -a "$HEALTH_LOG_FILE"
}

# Logging functions
log_info() { _log "$GREEN" "INFO" "$@"; }
log_warn() { _log "$YELLOW" "WARN" "$@"; }
log_error() { _log "$RED" "ERROR" "$@"; }

# Debug lines are only emitted when OPENCLAW_DEBUG=true.
log_debug() {
  if [[ "${OPENCLAW_DEBUG:-false}" == "true" ]]; then
    _log "$BLUE" "DEBUG" "$@"
  fi
}

#######################################
# Source the backup environment file if it exists; warn otherwise.
# Globals: BACKUP_ENV_FILE_PATH (read)
#######################################
load_env() {
  if [[ -f "$BACKUP_ENV_FILE_PATH" ]]; then
    # shellcheck disable=SC1090
    source "$BACKUP_ENV_FILE_PATH"
    log_debug "Loaded environment from $BACKUP_ENV_FILE_PATH"
  else
    log_warn "Environment file not found: $BACKUP_ENV_FILE_PATH"
  fi
}

#######################################
# Check that the required external commands exist.
# Returns: 0 if python3 and curl are both found, 1 otherwise
#######################################
check_commands() {
  local missing=()
  local cmd

  for cmd in python3 curl; do
    if ! command -v "$cmd" &>/dev/null; then
      missing+=("$cmd")
    fi
  done

  if [[ ${#missing[@]} -gt 0 ]]; then
    log_error "Missing required commands: ${missing[*]}"
    return 1
  fi

  log_debug "All required commands available"
  return 0
}

#######################################
# Check that the required environment variables are set.
# Returns: number of missing required variables (0 means OK)
#######################################
check_env_vars() {
  local errors=0

  # Required variables
  if [[ -z "${HF_TOKEN:-}" ]]; then
    log_error "HF_TOKEN is not set"
    errors=$((errors + 1))
  else
    log_debug "HF_TOKEN is set"
  fi

  if [[ -z "${OPENCLAW_BACKUP_DATASET_REPO:-}" ]]; then
    log_error "OPENCLAW_BACKUP_DATASET_REPO is not set"
    errors=$((errors + 1))
  else
    log_debug "Dataset repo: $OPENCLAW_BACKUP_DATASET_REPO"
  fi

  # Optional variables with defaults
  if [[ -z "${OPENCLAW_BACKUP_CRON:-}" ]]; then
    log_warn "OPENCLAW_BACKUP_CRON not set, using default: */10 * * * *"
  fi

  return "$errors"
}

#######################################
# Check HuggingFace API connectivity, retrying up to three times.
# A 429 response waits for the Retry-After value when it is numeric,
# otherwise uses exponential backoff starting at 60 seconds.
# Globals: HF_TOKEN, OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED (read)
# Returns: 0 when reachable (or the check is disabled), 1 otherwise
#######################################
check_hf_connectivity() {
  log_info "Checking HuggingFace connectivity..."

  if [[ "${OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED:-false}" == "false" ]]; then
    log_info "Skipping HF connectivity check (OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED=false, network checks disabled)"
    return 0
  fi

  local max_retries=3
  local retry_count=0
  local response http_code retry_after wait_time

  while ((retry_count < max_retries)); do
    # Headers go to stdout (-D -), the body is discarded, and -w appends the
    # status code on its own final line. -f is deliberately not used so that
    # 4xx responses still yield their headers and code. curl's own stderr is
    # dropped so it cannot interleave with the captured output.
    response=$(curl -sS -D - -o /dev/null \
      --connect-timeout 10 \
      --max-time 30 \
      -w '\n%{http_code}' \
      -H "Authorization: Bearer ${HF_TOKEN:-}" \
      "https://huggingface.co/api/whoami" 2>/dev/null) || true

    http_code=${response##*$'\n'}
    # curl reports 000 when no response was received; mirror that if the
    # output is unusable (e.g. curl could not run at all).
    [[ "$http_code" =~ ^[0-9]{3}$ ]] || http_code="000"

    retry_after=$(awk 'tolower($1) == "retry-after:" { print $2; exit }' \
      <<<"$response" | tr -d '\r')

    if [[ "$http_code" == "200" ]]; then
      log_info "✅ HuggingFace API is accessible"
      return 0
    elif [[ "$http_code" == "401" ]]; then
      log_warn "HF API returned 401 (Unauthorized) - token may be invalid"
      return 1
    elif [[ "$http_code" == "429" ]]; then
      if [[ "$retry_after" =~ ^[0-9]+$ ]]; then
        wait_time=$retry_after
        log_warn "HF API returned 429 (Rate Limited) - Retry-After: ${wait_time}s"
      else
        wait_time=$((60 * (2 ** retry_count)))
        log_warn "HF API returned 429 (Rate Limited) - using exponential backoff: ${wait_time}s"
      fi

      if ((retry_count < max_retries - 1)); then
        log_info "Waiting ${wait_time} seconds before retry $((retry_count + 2))/${max_retries}..."
        sleep "$wait_time"
      fi
      retry_count=$((retry_count + 1))
      continue
    fi

    retry_count=$((retry_count + 1))
    log_warn "HuggingFace API check failed (attempt $retry_count/$max_retries, code: $http_code)"
    # No point sleeping after the final attempt.
    if ((retry_count < max_retries)); then
      sleep 5
    fi
  done

  log_error "❌ Cannot connect to HuggingFace API"
  log_warn "If this is a network issue, you can set OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED=false to skip"
  return 1
}

#######################################
# Check that the backup dataset can be read through huggingface_hub.
# The dataset name and token are handed to Python via the environment
# (never interpolated into the Python source).
# Globals: OPENCLAW_BACKUP_DATASET_REPO, HF_TOKEN (read)
# Returns: 0 when the dataset info and file listing both succeed
#######################################
check_dataset_access() {
  local dataset="${OPENCLAW_BACKUP_DATASET_REPO:-}"

  if [[ -z "$dataset" ]]; then
    log_error "Cannot check dataset: OPENCLAW_BACKUP_DATASET_REPO not set"
    return 1
  fi

  log_info "Checking dataset access: $dataset"

  local result
  local status=0
  result=$(
    HF_TOKEN="${HF_TOKEN:-}" OPENCLAW_HC_DATASET="$dataset" python3 - <<'PYEOF' 2>&1
import os
import sys

sys.path.insert(0, '/opt/openclaw-hf')

dataset = os.environ['OPENCLAW_HC_DATASET']

try:
    from huggingface_hub import HfApi

    api = HfApi(token=os.getenv('HF_TOKEN') or None)

    # Try to get dataset info
    info = api.dataset_info(dataset)
    print(f"SUCCESS: Dataset exists, id={info.id}")

    # Try to list files
    files = list(api.list_repo_files(dataset, repo_type='dataset'))
    backup_files = [f for f in files if 'openclaw-backup' in f]
    print(f"INFO: Found {len(backup_files)} backup files")
except Exception as e:
    print(f"ERROR: {e}")
    sys.exit(1)
PYEOF
  ) || status=$?

  # Library warnings on stderr may precede the marker lines, so look the
  # markers up by line instead of testing the start of the whole output.
  local success_line info_line
  success_line=$(grep -m 1 '^SUCCESS:' <<<"$result" || true)
  info_line=$(grep -m 1 '^INFO:' <<<"$result" || true)

  if ((status == 0)) && [[ -n "$success_line" ]]; then
    log_info "✅ Dataset access OK: ${success_line#SUCCESS: }"
    if [[ -n "$info_line" ]]; then
      log_info "${info_line#INFO: }"
    fi
    return 0
  fi

  log_error "❌ Dataset access failed: $result"
  return 1
}

#######################################
# Check that the backup scripts exist and (for non-.py files) are
# executable, trying chmod +x on those that are not.
# Returns: number of scripts that are missing or could not be fixed
#######################################
check_backup_scripts() {
  local scripts=(
    "/usr/local/bin/openclaw-backup-cron.sh"
    "/usr/local/bin/openclaw-restore.sh"
    "/opt/openclaw-hf/openclaw_hf/backup.py"
  )
  local errors=0
  local script

  for script in "${scripts[@]}"; do
    if [[ ! -f "$script" ]]; then
      log_error "❌ Script not found: $script"
      errors=$((errors + 1))
    elif [[ ! -x "$script" && "$script" != *.py ]]; then
      log_warn "⚠️  Script not executable: $script"
      if ! chmod +x "$script" 2>/dev/null; then
        log_error "❌ Cannot make $script executable"
        errors=$((errors + 1))
      fi
    else
      log_debug "✅ Script OK: $script"
    fi
  done

  return "$errors"
}

#######################################
# Inspect the backup log: find the last success line, check its age
# against MAX_BACKUP_AGE_MINUTES and report the last few error lines.
# Globals: BACKUP_LOG_FILE, MAX_BACKUP_AGE_MINUTES (read)
# Returns: 0 when a sufficiently recent success exists, 1 otherwise
#######################################
check_recent_backups() {
  log_info "Checking recent backup status..."

  if [[ ! -f "$BACKUP_LOG_FILE" ]]; then
    log_warn "⚠️  Backup log not found: $BACKUP_LOG_FILE"
    return 1
  fi

  # Check for recent successful backup.
  # Match actual success indicators, not error messages containing
  # "backup complete". "|| true": grep exits 1 on no match, which would
  # otherwise abort the script under pipefail + errexit.
  local last_success
  last_success=$(grep -iE "(backup uploaded|backup complete success)" \
    "$BACKUP_LOG_FILE" 2>/dev/null | tail -n 1 || true)

  if [[ -z "$last_success" ]]; then
    log_warn "⚠️  No successful backup found in logs"
    return 1
  fi

  # Extract timestamp and calculate age.
  # Supports formats: [2026-04-24T04:00:02], 2026-04-24T04:00:02,
  # 2026-04-24 04:00:02
  local log_time
  log_time=$(grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2}[T ][0-9]{2}:[0-9]{2}:[0-9]{2}' \
    <<<"$last_success" | head -n 1 || true)

  if [[ -n "$log_time" ]]; then
    local log_epoch now_epoch age_minutes
    # Normalize space separator to T for consistent parsing
    log_time=${log_time/ /T}
    log_epoch=$(date -d "$log_time" +%s 2>/dev/null || true)

    if [[ "$log_epoch" =~ ^[0-9]+$ ]]; then
      now_epoch=$(date +%s)
      age_minutes=$(((now_epoch - log_epoch) / 60))

      if ((age_minutes > MAX_BACKUP_AGE_MINUTES)); then
        log_warn "⚠️  Last successful backup was $age_minutes minutes ago (max: $MAX_BACKUP_AGE_MINUTES)"
        return 1
      fi
      log_info "✅ Last backup was $age_minutes minutes ago"
    else
      # Same outcome as a line without any timestamp: the age check is
      # skipped rather than failing on an arithmetic error.
      log_warn "⚠️  Could not parse backup timestamp: $log_time"
    fi
  fi

  # Check for recent errors
  local recent_errors line
  recent_errors=$(grep -iE 'error|failed|exception' "$BACKUP_LOG_FILE" \
    2>/dev/null | tail -n 5 || true)

  if [[ -n "$recent_errors" ]]; then
    log_warn "⚠️  Recent errors found in backup log:"
    while read -r line; do
      log_warn "  $line"
    done <<<"$recent_errors"
  fi

  return 0
}

#######################################
# Check disk usage of the filesystem holding /root.
# Returns: 1 above 90% (or when usage cannot be read), 0 otherwise
#######################################
check_disk_space() {
  log_info "Checking disk space..."

  # -P forces the POSIX single-line-per-filesystem format so that long
  # device names cannot push the percentage onto another line.
  local used_pct
  used_pct=$(df -P /root 2>/dev/null \
    | awk 'NR == 2 { sub(/%/, "", $5); print $5 }' || true)

  if [[ ! "$used_pct" =~ ^[0-9]+$ ]]; then
    log_warn "⚠️  Could not determine disk usage for /root"
    return 1
  fi

  if ((used_pct > 90)); then
    log_error "❌ Disk usage critical: ${used_pct}%"
    return 1
  elif ((used_pct > 80)); then
    log_warn "⚠️  Disk usage high: ${used_pct}%"
  else
    log_info "✅ Disk usage OK: ${used_pct}%"
  fi

  return 0
}

#######################################
# Print the stored failed-attempt count.
# Outputs: a non-negative decimal integer; 0 when the file is missing
#          or does not contain a plain number
#######################################
get_failed_attempts() {
  local count=0

  if [[ -f "$FAILED_ATTEMPTS_FILE" ]]; then
    count=$(cat "$FAILED_ATTEMPTS_FILE" 2>/dev/null || true)
  fi

  [[ "$count" =~ ^[0-9]+$ ]] || count=0
  # 10# forces base 10 so a value such as "08" is not read as octal.
  printf '%d\n' "$((10#$count))"
}

# Increment failed attempts
increment_failed_attempts() {
  local current
  current=$(get_failed_attempts)
  echo $((current + 1)) >"$FAILED_ATTEMPTS_FILE"
}

# Reset failed attempts
reset_failed_attempts() {
  echo "0" >"$FAILED_ATTEMPTS_FILE"
}

#######################################
# Check whether a backup or restore process is currently running.
# Lines from pgrep whose PID is this shell or its parent are ignored.
# Returns: 0 when a backup/restore process is found, 1 otherwise
#######################################
check_backup_running() {
  local my_pid=$$
  local my_ppid=$PPID
  # backup.py sub-command as its own word; [[:space:]] is required because
  # "\s" inside a bracket expression does not mean whitespace.
  local mode_re='(^|[[:space:]/])(backup|restore)($|[[:space:]/])'
  local processes line pid args

  processes=$(pgrep -af 'backup\.py|openclaw-backup-cron|openclaw-restore' \
    2>/dev/null || true)

  while IFS= read -r line; do
    if [[ -z "$line" ]]; then
      continue
    fi

    # pgrep -a prints "<pid> <command line>"; compare the PID field only so
    # that digits elsewhere in the line cannot cause a false skip.
    pid=${line%%[[:space:]]*}
    if [[ "$pid" == "$my_pid" || "$pid" == "$my_ppid" ]]; then
      continue
    fi

    if [[ "$line" =~ backup\.py ]]; then
      args=${line##*backup.py}
      if [[ "$args" =~ $mode_re || "$args" == *--command=* ]]; then
        log_info "Backup/restore is currently running: $line"
        return 0
      fi
    fi

    if [[ "$line" == *openclaw-backup-cron* && "$line" != *watchdog* ]]; then
      log_info "Backup cron is currently running: $line"
      return 0
    fi

    if [[ "$line" == *openclaw-restore* ]]; then
      log_info "Restore is currently running: $line"
      return 0
    fi
  done <<<"$processes"

  return 1
}

#######################################
# Repair function: fix common issues (script permissions, log directory,
# log file mode, Python cache) and run a local backup test against a
# deliberately invalid dataset repo so nothing is uploaded.
# Returns: 1 when skipped because a backup/restore is running or when no
#          repair was made, 0 otherwise
#######################################
repair_backup_system() {
  log_info "🔧 Attempting to repair backup system..."

  # Check if backup/restore is running before doing any repairs
  if check_backup_running; then
    log_warn "⚠️  Backup or restore is currently running, skipping repair to avoid interference"
    return 1
  fi

  local repairs_made=0

  # Repair 1: Fix script permissions
  log_info "Repair 1: Fixing script permissions..."
  if chmod +x /usr/local/bin/openclaw-*.sh 2>/dev/null; then
    log_info "✅ Fixed script permissions"
    repairs_made=$((repairs_made + 1))
  fi

  # Repair 2: Recreate backup log directory
  if [[ ! -d "$BACKUP_LOG_DIR" ]]; then
    log_info "Repair 2: Creating backup log directory..."
    mkdir -p "$BACKUP_LOG_DIR"
    touch "$BACKUP_LOG_FILE"
    log_info "✅ Created backup log directory"
    repairs_made=$((repairs_made + 1))
  fi

  # Repair 3: Fix log file permissions
  if [[ -f "$BACKUP_LOG_FILE" ]]; then
    chmod 644 "$BACKUP_LOG_FILE"
  fi

  # Repair 4: Clear Python cache (might fix import issues).
  # Best effort: errors (e.g. a missing tree) are intentionally ignored.
  log_info "Repair 4: Clearing Python cache..."
  find /opt/openclaw-hf -type d -name "__pycache__" -exec rm -rf {} + \
    2>/dev/null || true
  log_info "✅ Cleared Python cache"
  repairs_made=$((repairs_made + 1))

  # Repair 5: Test backup script manually (dry-run only to avoid polluting
  # remote backups)
  log_info "Repair 5: Testing backup script (dry-run)..."
  if check_backup_running; then
    log_warn "⚠️  Backup started while repair was in progress, skipping backup test"
  else
    local test_work_dir=""
    test_work_dir=$(mktemp -d /tmp/openclaw-backup-test-XXXXXX) || test_work_dir=""

    if [[ -z "$test_work_dir" ]]; then
      log_error "❌ Backup dry-run test failed (exit code: 1)"
      increment_failed_attempts
    else
      # Paths are passed as positional arguments instead of being spliced
      # into the command string, so unusual characters cannot break it.
      local test_exit_code=0
      # shellcheck disable=SC2016  # expansion is meant for the child shell
      timeout 60 bash -c '
        source "$1" 2>/dev/null || true
        source /etc/profile.d/openclaw-env.sh 2>/dev/null || true
        export HF_TOKEN
        export OPENCLAW_BACKUP_DATASET_REPO="__health_check_test__invalid_repo__"
        export OPENCLAW_BACKUP_WORK_DIR="$2"
        export OPENCLAW_BACKUP_KEEP_COUNT="1"
        python3 /opt/openclaw-hf/openclaw_hf/backup.py backup 2>&1
      ' _ "$BACKUP_ENV_FILE_PATH" "$test_work_dir" \
        >>"$BACKUP_LOG_FILE" 2>&1 || test_exit_code=$?

      if ((test_exit_code == 0)); then
        log_info "✅ Backup dry-run test successful"
        reset_failed_attempts
        repairs_made=$((repairs_made + 1))
      elif [[ -n "$(find "$test_work_dir" -name 'openclaw-backup-*.tar.gz' \
        -print -quit 2>/dev/null)" ]]; then
        # The upload step is expected to fail against the invalid repo; a
        # locally created archive is treated as a successful test.
        log_info "✅ Backup dry-run test successful (archive created locally)"
        reset_failed_attempts
        repairs_made=$((repairs_made + 1))
      else
        log_error "❌ Backup dry-run test failed (exit code: $test_exit_code)"
        increment_failed_attempts
      fi

      rm -rf -- "${test_work_dir:?}" 2>/dev/null || true
    fi
  fi

  log_info "🔧 Repair complete. Repairs made: $repairs_made"
  if ((repairs_made == 0)); then
    return 1
  fi
  return 0
}

#######################################
# Emergency repair: clear the HuggingFace cache, force-reinstall
# huggingface_hub and reset the failed-attempt counter.
# Returns: 1 when the reinstall fails, 0 otherwise
#######################################
emergency_repair() {
  log_warn "🚨 EMERGENCY REPAIR MODE"

  # Clear all caches
  rm -rf /root/.cache/huggingface 2>/dev/null || true

  # Reinstall huggingface_hub if needed. pip's stderr is kept in the health
  # log so a failed reinstall can be diagnosed.
  if ! python3 -m pip install --upgrade --force-reinstall \
    "huggingface_hub[cli]>=0.31.1" 2>>"$HEALTH_LOG_FILE"; then
    log_error "Failed to reinstall huggingface_hub"
    return 1
  fi

  # Reset failed attempts
  reset_failed_attempts

  log_info "🚨 Emergency repair complete"
  return 0
}

#######################################
# Main health check: run every check, print a summary, and trigger a
# repair (or an emergency repair once MAX_FAILED_ATTEMPTS is reached)
# when any check failed.
# Returns: 0 when all checks passed, 1 otherwise
#######################################
run_health_check() {
  log_info "========================================"
  log_info "OpenClaw Backup Health Check Starting"
  log_info "========================================"

  load_env

  local total_checks=0
  local passed_checks=0
  local failed_checks=0
  local check

  # Checks run in this order: commands, environment variables, HF
  # connectivity, dataset access, scripts, recent backups, disk space.
  # Counters use var=$((var + 1)): "((var++))" returns status 1 when the
  # old value is 0, which would terminate the script under "set -e".
  for check in \
    check_commands \
    check_env_vars \
    check_hf_connectivity \
    check_dataset_access \
    check_backup_scripts \
    check_recent_backups \
    check_disk_space; do
    total_checks=$((total_checks + 1))
    if "$check"; then
      passed_checks=$((passed_checks + 1))
    else
      failed_checks=$((failed_checks + 1))
    fi
  done

  # Summary
  log_info "========================================"
  log_info "Health Check Summary"
  log_info "========================================"
  log_info "Total checks: $total_checks"
  log_info "✅ Passed: $passed_checks"
  log_info "❌ Failed: $failed_checks"

  # Auto-repair if needed
  local failed_attempts
  failed_attempts=$(get_failed_attempts)

  if ((failed_checks > 0)); then
    log_warn "Some checks failed. Failed attempts: $failed_attempts/$MAX_FAILED_ATTEMPTS"

    # The repair outcome does not change the result: a failed health check
    # always returns 1.
    if ((failed_attempts >= MAX_FAILED_ATTEMPTS)); then
      log_error "Max failed attempts reached. Running emergency repair..."
      emergency_repair || true
    else
      repair_backup_system || true
    fi

    return 1
  fi

  log_info "✅ All health checks passed!"
  reset_failed_attempts
  return 0
}

# Show usage
usage() {
  cat <<EOF
OpenClaw Backup Health Check & Auto-Repair

Usage: $0 [OPTIONS]

Options:
    -h, --help          Show this help message
    -c, --check         Run health check only (no repair)
    -r, --repair        Run repair only
    -e, --emergency     Run emergency repair
    -v, --verbose       Enable verbose output

Examples:
    $0                  Run full health check with auto-repair
    $0 --check          Run health check only
    $0 --repair         Run repair only
    $0 --emergency      Run emergency repair
EOF
}

#######################################
# Entry point: create the log directory and dispatch on the first
# argument. Any unrecognised argument runs the full health check.
# Arguments: $1 - optional mode flag (see usage)
#######################################
main() {
  # Create log directory
  mkdir -p "$BACKUP_LOG_DIR"

  case "${1:-}" in
    -h | --help)
      usage
      exit 0
      ;;
    -c | --check)
      load_env
      # Stops at the first failing check; its status becomes the exit code.
      check_commands && check_env_vars && check_hf_connectivity \
        && check_dataset_access && check_backup_scripts \
        && check_recent_backups && check_disk_space
      exit $?
      ;;
    -r | --repair)
      load_env
      repair_backup_system
      exit $?
      ;;
    -e | --emergency)
      load_env
      emergency_repair
      exit $?
      ;;
    -v | --verbose)
      export OPENCLAW_DEBUG=true
      run_health_check
      exit $?
      ;;
    *)
      run_health_check
      exit $?
      ;;
  esac
}

main "$@"