#!/usr/bin/env bash
#
# OpenClaw backup health check & auto-repair.
# Verifies required commands/env vars, HuggingFace connectivity, backup
# scripts, recent-backup freshness and disk space, and attempts self-repair
# (escalating to an "emergency" repair after repeated failures).
#
# Required env:  HF_TOKEN, OPENCLAW_BACKUP_DATASET_REPO
# Optional env:  OPENCLAW_BACKUP_ENV_FILE_PATH, OPENCLAW_BACKUP_LOG_DIR,
#                OPENCLAW_MAX_BACKUP_AGE_MINUTES, OPENCLAW_MAX_FAILED_ATTEMPTS,
#                OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED, OPENCLAW_DEBUG

# Fail fast: abort on errors, unset variables, and failed pipeline stages.
set -euo pipefail

# Directory containing this script.
# NOTE(review): SCRIPT_DIR appears unused in this file — confirm before removing.
# shellcheck disable=SC2034
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Optional env file sourced by load_env (may define/override backup settings).
BACKUP_ENV_FILE_PATH="${OPENCLAW_BACKUP_ENV_FILE_PATH:-/root/.env.d/openclaw-backup.env}"
BACKUP_LOG_DIR="${OPENCLAW_BACKUP_LOG_DIR:-/var/log/openclaw}"
BACKUP_LOG_FILE="${BACKUP_LOG_DIR}/backup.log"        # written by the backup cron job
HEALTH_LOG_FILE="${BACKUP_LOG_DIR}/health-check.log"  # written by this script's log_* helpers
# A last-successful-backup older than this many minutes counts as stale.
MAX_BACKUP_AGE_MINUTES="${OPENCLAW_MAX_BACKUP_AGE_MINUTES:-20}"
# Consecutive dry-run failures before escalating to emergency_repair.
MAX_FAILED_ATTEMPTS="${OPENCLAW_MAX_FAILED_ATTEMPTS:-3}"
# Persisted consecutive-failure counter (see get/increment/reset helpers).
FAILED_ATTEMPTS_FILE="${BACKUP_LOG_DIR}/.failed-attempts"

# ANSI colour escapes used by the log_* helpers (NC = reset).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
|
|
| |
# Shared log writer: "$1"=colour escape, "$2"=tag, rest=message.
# Echoes a coloured, timestamped line to stdout and appends it to
# HEALTH_LOG_FILE via tee.
_log_line() {
    local colour=$1 tag=$2
    shift 2
    echo -e "${colour}[${tag}]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" | tee -a "$HEALTH_LOG_FILE"
}

# Informational message (green tag).
log_info() {
    _log_line "$GREEN" "INFO" "$@"
}

# Warning message (yellow tag).
log_warn() {
    _log_line "$YELLOW" "WARN" "$@"
}

# Error message (red tag).
log_error() {
    _log_line "$RED" "ERROR" "$@"
}

# Debug message (blue tag); emitted only when OPENCLAW_DEBUG=true.
# Returns 0 either way so callers under `set -e` are unaffected.
log_debug() {
    if [[ "${OPENCLAW_DEBUG:-false}" == "true" ]]; then
        _log_line "$BLUE" "DEBUG" "$@"
    fi
}
|
|
| |
# Source the optional backup environment file. Missing file is only a
# warning, never a failure — the script can run on ambient env vars alone.
load_env() {
    if [[ ! -f "$BACKUP_ENV_FILE_PATH" ]]; then
        log_warn "Environment file not found: $BACKUP_ENV_FILE_PATH"
        return
    fi
    # shellcheck disable=SC1090 — path is configured at runtime
    source "$BACKUP_ENV_FILE_PATH"
    log_debug "Loaded environment from $BACKUP_ENV_FILE_PATH"
}
|
|
| |
# Verify that required external commands are installed.
# Arguments: optional list of command names to check; defaults to the
#            commands this script needs (python3, curl) — backward compatible.
# Returns:   0 when all are found, 1 otherwise (missing names are logged).
check_commands() {
    local -a required=()
    local -a missing=()
    local cmd

    if (( $# > 0 )); then
        required=("$@")
    else
        required=(python3 curl)
    fi

    for cmd in "${required[@]}"; do
        # `command -v` is the portable existence test (preferred over `which`).
        if ! command -v "$cmd" &> /dev/null; then
            missing+=("$cmd")
        fi
    done

    if [[ ${#missing[@]} -gt 0 ]]; then
        log_error "Missing required commands: ${missing[*]}"
        return 1
    fi

    log_debug "All required commands available"
    return 0
}
|
|
| |
# Validate required backup environment variables.
# Returns: the number of missing required variables (0 = all present).
# NB: uses `errors=$((errors + 1))` instead of `((errors++))` — the
# post-increment form returns status 1 when the value was 0, which would
# abort the script under `set -e` if this function is ever called as a
# plain command rather than inside a condition.
check_env_vars() {
    local errors=0

    if [[ -z "${HF_TOKEN:-}" ]]; then
        log_error "HF_TOKEN is not set"
        errors=$((errors + 1))
    else
        log_debug "HF_TOKEN is set"
    fi

    if [[ -z "${OPENCLAW_BACKUP_DATASET_REPO:-}" ]]; then
        log_error "OPENCLAW_BACKUP_DATASET_REPO is not set"
        errors=$((errors + 1))
    else
        log_debug "Dataset repo: $OPENCLAW_BACKUP_DATASET_REPO"
    fi

    # Optional: a missing cron spec only warrants a warning.
    if [[ -z "${OPENCLAW_BACKUP_CRON:-}" ]]; then
        log_warn "OPENCLAW_BACKUP_CRON not set, using default: */10 * * * *"
    fi

    return "$errors"
}
|
|
| |
# Probe the HuggingFace API, honouring rate limits with Retry-After /
# exponential backoff. Returns 0 on reachability (or when network checks
# are disabled), 1 on auth failure or exhausted retries.
#
# Fix vs previous version: the HTTP status was extracted with
# `tail -1` on the dumped headers, which yields the LAST header line —
# never the status code — so the 200/401/429 branches could never match.
# We now use curl's `-w '%{http_code}'` and dump headers to a temp file
# solely to read Retry-After.
check_hf_connectivity() {
    log_info "Checking HuggingFace connectivity..."

    # Opt-out switch for air-gapped / offline deployments (default: skip).
    if [[ "${OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED:-false}" == "false" ]]; then
        log_info "Skipping HF connectivity check (OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED=false, network checks disabled)"
        return 0
    fi

    local max_retries=3
    local retry_count=0
    local headers_file
    headers_file=$(mktemp)

    while [[ $retry_count -lt $max_retries ]]; do
        local http_code retry_after wait_time
        # NOTE(review): endpoint is /api/whoami; newer hub docs reference
        # /api/whoami-v2 — confirm which one this deployment expects.
        http_code=$(curl -sS -D "$headers_file" \
            --connect-timeout 10 \
            --max-time 30 \
            -o /dev/null -w '%{http_code}' \
            -H "Authorization: Bearer ${HF_TOKEN:-}" \
            "https://huggingface.co/api/whoami" 2>/dev/null) || http_code="000"
        retry_after=$(grep -i '^retry-after:' "$headers_file" 2>/dev/null | awk '{print $2}' | tr -d '\r')

        if [[ "$http_code" == "200" ]]; then
            rm -f "$headers_file"
            log_info "✅ HuggingFace API is accessible"
            return 0
        elif [[ "$http_code" == "401" ]]; then
            rm -f "$headers_file"
            log_warn "HF API returned 401 (Unauthorized) - token may be invalid"
            return 1
        elif [[ "$http_code" == "429" ]]; then
            # Prefer the server-provided Retry-After; otherwise back off
            # exponentially (60s, 120s, 240s...).
            if [[ -n "$retry_after" ]] && [[ "$retry_after" =~ ^[0-9]+$ ]]; then
                wait_time=$retry_after
                log_warn "HF API returned 429 (Rate Limited) - Retry-After: ${wait_time}s"
            else
                wait_time=$((60 * (2 ** retry_count)))
                log_warn "HF API returned 429 (Rate Limited) - using exponential backoff: ${wait_time}s"
            fi

            if [[ $retry_count -lt $((max_retries - 1)) ]]; then
                log_info "Waiting ${wait_time} seconds before retry $((retry_count + 2))/${max_retries}..."
                sleep "$wait_time"
            fi
            # Safe under set -e (post-increment `((x++))` returns 1 when x==0).
            retry_count=$((retry_count + 1))
            continue
        fi

        retry_count=$((retry_count + 1))
        log_warn "HuggingFace API check failed (attempt $retry_count/$max_retries, code: $http_code)"
        sleep 5
    done

    rm -f "$headers_file"
    log_error "❌ Cannot connect to HuggingFace API"
    log_warn "If this is a network issue, you can set OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED=false to skip"
    return 1
}
|
|
| |
# Verify we can reach the configured HF dataset repo and list its files.
# Reads:   OPENCLAW_BACKUP_DATASET_REPO, HF_TOKEN (inside the Python child)
# Returns: 0 when the embedded Python prints a SUCCESS line first, 1 otherwise.
# NOTE(review): "$dataset" is interpolated directly into the Python source
# below; a repo name containing quotes would break or inject code — confirm
# the value is trusted (it comes from the operator-controlled env file).
check_dataset_access() {
    local dataset="${OPENCLAW_BACKUP_DATASET_REPO:-}"

    if [[ -z "$dataset" ]]; then
        log_error "Cannot check dataset: OPENCLAW_BACKUP_DATASET_REPO not set"
        return 1
    fi

    log_info "Checking dataset access: $dataset"

    # Run the probe in Python (huggingface_hub); capture stdout+stderr so
    # exceptions surface in the ERROR message.
    local result
    result=$(python3 << PYEOF 2>&1
import sys
import os
sys.path.insert(0, '/opt/openclaw-hf')

try:
    from huggingface_hub import HfApi
    api = HfApi(token=os.getenv('HF_TOKEN'))

    # Try to get dataset info
    info = api.dataset_info("$dataset")
    print(f"SUCCESS: Dataset exists, id={info.id}")

    # Try to list files
    files = list(api.list_repo_files("$dataset", repo_type='dataset'))
    backup_files = [f for f in files if 'openclaw-backup' in f]
    print(f"INFO: Found {len(backup_files)} backup files")

except Exception as e:
    print(f"ERROR: {e}")
    sys.exit(1)
PYEOF
)

    # Glob match: result must START with "SUCCESS:" (extra INFO lines after
    # a newline still match, since * spans newlines inside [[ ]]).
    if [[ "$result" == SUCCESS:* ]]; then
        log_info "✅ Dataset access OK: ${result#SUCCESS: }"
        return 0
    else
        log_error "❌ Dataset access failed: $result"
        return 1
    fi
}
|
|
| |
# Verify the backup/restore entry points exist and that shell scripts are
# executable (attempting a chmod fix-up for non-executable ones).
# Returns: number of unfixable problems (0 = all OK).
# NB: increments use `x=$((x + 1))` instead of `((x++))`, whose status-1
# result on a zero value would trip `set -e` outside condition contexts.
check_backup_scripts() {
    local -a scripts=(
        "/usr/local/bin/openclaw-backup-cron.sh"
        "/usr/local/bin/openclaw-restore.sh"
        "/opt/openclaw-hf/openclaw_hf/backup.py"
    )

    local errors=0
    local script

    for script in "${scripts[@]}"; do
        if [[ ! -f "$script" ]]; then
            log_error "❌ Script not found: $script"
            errors=$((errors + 1))
        elif [[ ! -x "$script" && "$script" != *.py ]]; then
            # Python files are invoked via `python3`, so only shell scripts
            # need the executable bit.
            log_warn "⚠️ Script not executable: $script"
            if ! chmod +x "$script" 2>/dev/null; then
                log_error "❌ Cannot make $script executable"
                errors=$((errors + 1))
            fi
        else
            log_debug "✅ Script OK: $script"
        fi
    done

    return "$errors"
}
|
|
| |
# Check the backup log for a recent successful upload.
# Returns: 0 when a success entry exists and is fresh enough; 1 when the
#          log is missing, has no success entry, or the entry is stale.
# NOTE(review): timestamp parsing relies on GNU `date -d` — confirm the
# target host uses GNU coreutils (busybox/BSD date would fail the parse).
check_recent_backups() {
    log_info "Checking recent backup status..."

    if [[ ! -f "$BACKUP_LOG_FILE" ]]; then
        log_warn "⚠️ Backup log not found: $BACKUP_LOG_FILE"
        return 1
    fi

    # Most recent success marker, if any.
    local last_success
    last_success=$(grep -iE "(backup uploaded|backup complete success)" "$BACKUP_LOG_FILE" 2>/dev/null | tail -1)

    if [[ -z "$last_success" ]]; then
        log_warn "⚠️ No successful backup found in logs"
        return 1
    fi

    # Pull a "YYYY-MM-DD HH:MM:SS" (space or ISO 'T') timestamp off the line.
    local log_time
    log_time=$(echo "$last_success" | grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2}[T ][0-9]{2}:[0-9]{2}:[0-9]{2}' | head -1)

    if [[ -n "$log_time" ]]; then
        local log_epoch now_epoch age_minutes

        log_time=${log_time/ /T}
        # Guard against date parse failure: an empty log_epoch would make the
        # age arithmetic below a syntax error and abort under `set -e`.
        log_epoch=$(date -d "$log_time" +%s 2>/dev/null) || log_epoch=""
        now_epoch=$(date +%s)

        if [[ -n "$log_epoch" ]]; then
            age_minutes=$(( (now_epoch - log_epoch) / 60 ))

            if [[ $age_minutes -gt $MAX_BACKUP_AGE_MINUTES ]]; then
                log_warn "⚠️ Last successful backup was $age_minutes minutes ago (max: $MAX_BACKUP_AGE_MINUTES)"
                return 1
            fi
            log_info "✅ Last backup was $age_minutes minutes ago"
        else
            log_warn "⚠️ Could not parse backup timestamp: $log_time"
        fi
    fi

    # Surface (but do not fail on) the last few error-ish lines.
    local recent_errors
    recent_errors=$(grep -i "error\|failed\|exception" "$BACKUP_LOG_FILE" 2>/dev/null | tail -5)

    if [[ -n "$recent_errors" ]]; then
        log_warn "⚠️ Recent errors found in backup log:"
        echo "$recent_errors" | while read -r line; do
            log_warn "  $line"
        done
    fi

    return 0
}
|
|
| |
# Report root filesystem usage: error above 90%, warn above 80%, OK below.
# Returns 1 only in the critical (>90%) case.
check_disk_space() {
    log_info "Checking disk space..."

    local pct
    pct=$(df -h /root | awk 'NR==2 {print $5}' | tr -d '%')

    if [[ "$pct" -gt 90 ]]; then
        log_error "❌ Disk usage critical: ${pct}%"
        return 1
    fi

    if [[ "$pct" -gt 80 ]]; then
        log_warn "⚠️ Disk usage high: ${pct}%"
    else
        log_info "✅ Disk usage OK: ${pct}%"
    fi

    return 0
}
|
|
| |
# Print the persisted consecutive-failure counter.
# Robustness fix: a missing, unreadable, or corrupt (non-numeric) counter
# file now yields "0" instead of garbage, so later numeric comparisons
# (`[[ $failed_attempts -ge $MAX_FAILED_ATTEMPTS ]]`) never see bad input.
get_failed_attempts() {
    local value=""

    if [[ -f "$FAILED_ATTEMPTS_FILE" ]]; then
        value=$(<"$FAILED_ATTEMPTS_FILE") || value=""
    fi

    if [[ "$value" =~ ^[0-9]+$ ]]; then
        echo "$value"
    else
        echo "0"
    fi
}
|
|
| |
# Add one to the persisted failure counter.
increment_failed_attempts() {
    local n
    n=$(get_failed_attempts)
    printf '%s\n' "$((n + 1))" > "$FAILED_ATTEMPTS_FILE"
}
|
|
| |
# Zero out the persisted failure counter.
reset_failed_attempts() {
    printf '0\n' > "$FAILED_ATTEMPTS_FILE"
}
|
|
| |
# Detect whether a backup/restore process (other than our own tree) is
# currently running. Returns 0 (true) when one is found, 1 otherwise.
#
# Fixes vs previous version:
#  - `[\s/]` in a bash `=~` pattern is POSIX ERE, where \s is NOT a
#    whitespace class (it matches a literal backslash or 's'); replaced
#    with [[:space:]/].
#  - Own-process filtering now compares the exact PID field only; the old
#    substring test (`*"$my_pid"*`) could falsely skip unrelated lines
#    (e.g. PID 123 matching inside 1234).
check_backup_running() {
    local my_pid=$$
    local my_ppid=$PPID

    local processes
    processes=$(pgrep -af "backup\.py|openclaw-backup-cron|openclaw-restore" 2>/dev/null || true)

    local line pid args
    while IFS= read -r line; do
        [[ -z "$line" ]] && continue

        # First field of `pgrep -a` output is the PID; skip our own tree.
        pid=${line%% *}
        [[ "$pid" == "$my_pid" || "$pid" == "$my_ppid" ]] && continue

        # backup.py invoked with a backup/restore subcommand (or --command=).
        if [[ "$line" =~ backup\.py ]]; then
            args=$(echo "$line" | sed 's/.*backup\.py//' | tr -s ' ')
            if [[ "$args" =~ (^|[[:space:]/])(backup|restore)($|[[:space:]/]) ]] || [[ "$args" =~ --command= ]]; then
                log_info "Backup/restore is currently running: $line"
                return 0
            fi
        fi

        # The cron wrapper itself (ignore any watchdog variant).
        if [[ "$line" =~ openclaw-backup-cron ]] && [[ "$line" != *watchdog* ]]; then
            log_info "Backup cron is currently running: $line"
            return 0
        fi

        if [[ "$line" =~ openclaw-restore ]]; then
            log_info "Restore is currently running: $line"
            return 0
        fi
    done <<< "$processes"

    return 1
}
|
|
| |
# Attempt non-destructive repairs: fix permissions, recreate the log dir,
# clear Python caches, and dry-run the backup script against an invalid
# repo. Returns 0 when at least one repair was made, 1 otherwise.
#
# Fix vs previous version: `((repairs_made++))` as the final command of a
# `cmd && { ... }` list returns status 1 when the counter was 0, which
# aborts the whole script under `set -e` when this function is invoked as
# a plain command (e.g. `main --repair`). All increments now use
# arithmetic assignment, which always succeeds.
repair_backup_system() {
    log_info "🔧 Attempting to repair backup system..."

    # Never touch the system while a backup/restore is in flight.
    if check_backup_running; then
        log_warn "⚠️ Backup or restore is currently running, skipping repair to avoid interference"
        return 1
    fi

    local repairs_made=0

    # Repair 1: executable bits on the shell entry points.
    log_info "Repair 1: Fixing script permissions..."
    if chmod +x /usr/local/bin/openclaw-*.sh 2>/dev/null; then
        log_info "✅ Fixed script permissions"
        repairs_made=$((repairs_made + 1))
    fi

    # Repair 2: log directory/file.
    if [[ ! -d "$BACKUP_LOG_DIR" ]]; then
        log_info "Repair 2: Creating backup log directory..."
        mkdir -p "$BACKUP_LOG_DIR"
        touch "$BACKUP_LOG_FILE"
        log_info "✅ Created backup log directory"
        repairs_made=$((repairs_made + 1))
    fi

    # Repair 3: log file readable by monitoring tools.
    if [[ -f "$BACKUP_LOG_FILE" ]]; then
        chmod 644 "$BACKUP_LOG_FILE"
    fi

    # Repair 4: stale bytecode can mask code updates.
    log_info "Repair 4: Clearing Python cache..."
    find /opt/openclaw-hf -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
    log_info "✅ Cleared Python cache"
    repairs_made=$((repairs_made + 1))

    # Repair 5: dry-run the backup script against an invalid repo; success,
    # or a locally created archive, proves the pipeline works up to upload.
    log_info "Repair 5: Testing backup script (dry-run)..."
    if check_backup_running; then
        log_warn "⚠️ Backup started while repair was in progress, skipping backup test"
    else
        local test_work_dir
        test_work_dir=$(mktemp -d /tmp/openclaw-backup-test-XXXXXX)
        local test_exit_code=0
        if timeout 60 bash -c "
            source '$BACKUP_ENV_FILE_PATH' 2>/dev/null || true
            source /etc/profile.d/openclaw-env.sh 2>/dev/null || true
            export HF_TOKEN
            export OPENCLAW_BACKUP_DATASET_REPO='__health_check_test__invalid_repo__'
            export OPENCLAW_BACKUP_WORK_DIR='$test_work_dir'
            export OPENCLAW_BACKUP_KEEP_COUNT='1'
            python3 /opt/openclaw-hf/openclaw_hf/backup.py backup 2>&1
        " >> "$BACKUP_LOG_FILE" 2>&1; then
            log_info "✅ Backup dry-run test successful"
            reset_failed_attempts
            repairs_made=$((repairs_made + 1))
        else
            test_exit_code=$?
            if [[ -n $(find "$test_work_dir" -name 'openclaw-backup-*.tar.gz' -print -quit 2>/dev/null) ]]; then
                log_info "✅ Backup dry-run test successful (archive created locally)"
                reset_failed_attempts
                repairs_made=$((repairs_made + 1))
            else
                log_error "❌ Backup dry-run test failed (exit code: $test_exit_code)"
                increment_failed_attempts
            fi
        fi
        rm -rf "$test_work_dir" 2>/dev/null || true
    fi

    log_info "🔧 Repair complete. Repairs made: $repairs_made"
    return $((repairs_made == 0 ? 1 : 0))
}
|
|
| |
# Last-resort recovery after repeated failures: wipe the HuggingFace cache,
# force-reinstall the hub client, and zero the failure counter.
# Returns 1 when the reinstall fails, 0 otherwise.
emergency_repair() {
    log_warn "🚨 EMERGENCY REPAIR MODE"

    # Drop possibly-corrupt cached HF state; best effort only.
    rm -rf /root/.cache/huggingface 2>/dev/null || true

    # Reinstall the hub client; a failure here makes the repair fatal.
    if ! python3 -m pip install --upgrade --force-reinstall "huggingface_hub[cli]>=0.31.1" 2>/dev/null; then
        log_error "Failed to reinstall huggingface_hub"
        return 1
    fi

    reset_failed_attempts

    log_info "🚨 Emergency repair complete"
    return 0
}
|
|
| |
# Run every health check in order, print a summary, and trigger repair
# (or emergency repair after MAX_FAILED_ATTEMPTS) when anything failed.
# Returns 0 when all checks pass, 1 otherwise.
#
# Critical fix: the previous version used `((total_checks++))` etc. —
# post-increment on a zero value returns exit status 1, and since main
# invokes run_health_check as a PLAIN command (not a condition) under
# `set -euo pipefail`, the very first counter increment silently killed
# the script. Counters now use arithmetic assignment, and the seven
# checks are driven from an array to keep the counting in one place.
run_health_check() {
    log_info "========================================"
    log_info "OpenClaw Backup Health Check Starting"
    log_info "========================================"

    load_env

    local total_checks=0
    local passed_checks=0
    local failed_checks=0

    # Checks run in this exact order (same as before the refactor).
    local -a checks=(
        check_commands
        check_env_vars
        check_hf_connectivity
        check_dataset_access
        check_backup_scripts
        check_recent_backups
        check_disk_space
    )

    local check
    for check in "${checks[@]}"; do
        total_checks=$((total_checks + 1))
        if "$check"; then
            passed_checks=$((passed_checks + 1))
        else
            failed_checks=$((failed_checks + 1))
        fi
    done

    log_info "========================================"
    log_info "Health Check Summary"
    log_info "========================================"
    log_info "Total checks: $total_checks"
    log_info "✅ Passed: $passed_checks"
    log_info "❌ Failed: $failed_checks"

    local failed_attempts
    failed_attempts=$(get_failed_attempts)

    if [[ $failed_checks -gt 0 ]]; then
        log_warn "Some checks failed. Failed attempts: $failed_attempts/$MAX_FAILED_ATTEMPTS"

        if [[ $failed_attempts -ge $MAX_FAILED_ATTEMPTS ]]; then
            log_error "Max failed attempts reached. Running emergency repair..."
            emergency_repair
        else
            repair_backup_system
        fi

        return 1
    fi

    log_info "✅ All health checks passed!"
    reset_failed_attempts
    return 0
}
|
|
| |
# Print command-line usage to stdout ($0 expands to the invoked path).
usage() {
    local self=$0
    cat << EOF
OpenClaw Backup Health Check & Auto-Repair

Usage: $self [OPTIONS]

Options:
  -h, --help       Show this help message
  -c, --check      Run health check only (no repair)
  -r, --repair     Run repair only
  -e, --emergency  Run emergency repair
  -v, --verbose    Enable verbose output

Examples:
  $self              Run full health check with auto-repair
  $self --check      Run health check only
  $self --repair     Run repair only
  $self --emergency  Run emergency repair
EOF
}
|
|
| |
# Entry point: dispatch on the first CLI argument and exit with the
# selected operation's status.
main() {
    # The log_* helpers tee into BACKUP_LOG_DIR, so it must exist first.
    mkdir -p "$BACKUP_LOG_DIR"

    local mode="${1:-}"

    case "$mode" in
        -h|--help)
            usage
            exit 0
            ;;
        -c|--check)
            # Checks only — short-circuits on the first failure, no repair.
            load_env
            check_commands && check_env_vars && check_hf_connectivity && \
                check_dataset_access && check_backup_scripts && \
                check_recent_backups && check_disk_space
            exit $?
            ;;
        -r|--repair)
            load_env
            repair_backup_system
            exit $?
            ;;
        -e|--emergency)
            load_env
            emergency_repair
            exit $?
            ;;
        -v|--verbose)
            # Verbose = full run with debug logging turned on.
            export OPENCLAW_DEBUG=true
            run_health_check
            exit $?
            ;;
        *)
            # Default: full health check with auto-repair.
            run_health_check
            exit $?
            ;;
    esac
}
|
|
| main "$@" |
|
|