#!/usr/bin/env bash
#
# OpenClaw Backup Health Check & Auto-Repair Script
# Health check and auto-repair script (健康检查与自动修复脚本)
#
set -euo pipefail
# Configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BACKUP_ENV_FILE_PATH="${OPENCLAW_BACKUP_ENV_FILE_PATH:-/root/.env.d/openclaw-backup.env}"
BACKUP_LOG_DIR="${OPENCLAW_BACKUP_LOG_DIR:-/var/log/openclaw}"
BACKUP_LOG_FILE="${BACKUP_LOG_DIR}/backup.log"
HEALTH_LOG_FILE="${BACKUP_LOG_DIR}/health-check.log"
MAX_BACKUP_AGE_MINUTES="${OPENCLAW_MAX_BACKUP_AGE_MINUTES:-30}"
MAX_FAILED_ATTEMPTS="${OPENCLAW_MAX_FAILED_ATTEMPTS:-3}"
FAILED_ATTEMPTS_FILE="${BACKUP_LOG_DIR}/.failed-attempts"
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Logging functions
log_info() {
echo -e "${GREEN}[INFO]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" | tee -a "$HEALTH_LOG_FILE"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" | tee -a "$HEALTH_LOG_FILE"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" | tee -a "$HEALTH_LOG_FILE"
}
log_debug() {
if [[ "${OPENCLAW_DEBUG:-false}" == "true" ]]; then
echo -e "${BLUE}[DEBUG]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" | tee -a "$HEALTH_LOG_FILE"
fi
}
# Load environment
load_env() {
if [[ -f "$BACKUP_ENV_FILE_PATH" ]]; then
# shellcheck disable=SC1090
source "$BACKUP_ENV_FILE_PATH"
log_debug "Loaded environment from $BACKUP_ENV_FILE_PATH"
else
log_warn "Environment file not found: $BACKUP_ENV_FILE_PATH"
fi
}
# Check if required commands exist
check_commands() {
local missing=()
for cmd in python3 curl; do
if ! command -v "$cmd" &> /dev/null; then
missing+=("$cmd")
fi
done
if [[ ${#missing[@]} -gt 0 ]]; then
log_error "Missing required commands: ${missing[*]}"
return 1
fi
log_debug "All required commands available"
return 0
}
# Check environment variables
check_env_vars() {
local errors=0
# Required variables
if [[ -z "${HF_TOKEN:-}" ]]; then
log_error "HF_TOKEN is not set"
((errors++))
else
log_debug "HF_TOKEN is set"
fi
if [[ -z "${OPENCLAW_BACKUP_DATASET_REPO:-}" ]]; then
log_error "OPENCLAW_BACKUP_DATASET_REPO is not set"
((errors++))
else
log_debug "Dataset repo: $OPENCLAW_BACKUP_DATASET_REPO"
fi
# Optional variables with defaults
if [[ -z "${OPENCLAW_BACKUP_CRON:-}" ]]; then
log_warn "OPENCLAW_BACKUP_CRON not set, using default: */10 * * * *"
fi
return $errors
}
# Check HuggingFace API connectivity
check_hf_connectivity() {
log_info "Checking HuggingFace connectivity..."
if [[ "${OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED:-false}" == "false" ]]; then
log_info "Skipping HF connectivity check (OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED=false, network checks disabled)"
return 0
fi
local max_retries=3
local retry_count=0
local curl_opts="-sSf"
while [[ $retry_count -lt $max_retries ]]; do
local http_code retry_after wait_time
local response_headers
response_headers=$(curl $curl_opts -D- \
--connect-timeout 10 \
--max-time 30 \
"https://huggingface.co/api/whoami" \
-H "Authorization: Bearer ${HF_TOKEN:-}" \
-o /dev/null 2>&1) || true
http_code=$(echo "$response_headers" | tail -1)
retry_after=$(echo "$response_headers" | grep -i "retry-after:" | awk '{print $2}' | tr -d '\r')
if [[ "$http_code" == "200" ]]; then
log_info "✅ HuggingFace API is accessible"
return 0
elif [[ "$http_code" == "401" ]]; then
log_warn "HF API returned 401 (Unauthorized) - token may be invalid"
return 1
elif [[ "$http_code" == "429" ]]; then
wait_time=60
if [[ -n "$retry_after" ]] && [[ "$retry_after" =~ ^[0-9]+$ ]]; then
wait_time=$retry_after
log_warn "HF API returned 429 (Rate Limited) - Retry-After: ${wait_time}s"
else
wait_time=$((60 * (2 ** retry_count)))
log_warn "HF API returned 429 (Rate Limited) - using exponential backoff: ${wait_time}s"
fi
if [[ $retry_count -lt $((max_retries - 1)) ]]; then
log_info "Waiting ${wait_time} seconds before retry $((retry_count + 2))/${max_retries}..."
sleep "$wait_time"
fi
((retry_count++))
continue
fi
((retry_count++))
log_warn "HuggingFace API check failed (attempt $retry_count/$max_retries, code: $http_code)"
sleep 5
done
log_error "❌ Cannot connect to HuggingFace API"
log_warn "If this is a network issue, you can set OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED=false to skip"
return 1
}
# Check dataset access
check_dataset_access() {
local dataset="${OPENCLAW_BACKUP_DATASET_REPO:-}"
if [[ -z "$dataset" ]]; then
log_error "Cannot check dataset: OPENCLAW_BACKUP_DATASET_REPO not set"
return 1
fi
log_info "Checking dataset access: $dataset"
local result
result=$(python3 << PYEOF 2>&1
import sys
import os
sys.path.insert(0, '/opt/openclaw-hf')
try:
from huggingface_hub import HfApi
api = HfApi(token=os.getenv('HF_TOKEN'))
# Try to get dataset info
info = api.dataset_info("$dataset")
print(f"SUCCESS: Dataset exists, id={info.id}")
# Try to list files
files = list(api.list_repo_files("$dataset", repo_type='dataset'))
backup_files = [f for f in files if 'openclaw-backup' in f]
print(f"INFO: Found {len(backup_files)} backup files")
except Exception as e:
print(f"ERROR: {e}")
sys.exit(1)
PYEOF
)
if [[ "$result" == SUCCESS:* ]]; then
log_info "✅ Dataset access OK: ${result#SUCCESS: }"
return 0
else
log_error "❌ Dataset access failed: $result"
return 1
fi
}
# Check backup script integrity
check_backup_scripts() {
local scripts=(
"/usr/local/bin/openclaw-backup-cron.sh"
"/usr/local/bin/openclaw-restore.sh"
"/opt/openclaw-hf/openclaw_hf/backup.py"
)
local errors=0
for script in "${scripts[@]}"; do
if [[ ! -f "$script" ]]; then
log_error "❌ Script not found: $script"
((errors++))
elif [[ ! -x "$script" ]] && [[ "$script" != *.py ]]; then
log_warn "⚠️ Script not executable: $script"
chmod +x "$script" 2>/dev/null || {
log_error "❌ Cannot make $script executable"
((errors++))
}
else
log_debug "✅ Script OK: $script"
fi
done
return $errors
}
# Check backup log for recent failures
check_recent_backups() {
log_info "Checking recent backup status..."
if [[ ! -f "$BACKUP_LOG_FILE" ]]; then
log_warn "⚠️ Backup log not found: $BACKUP_LOG_FILE"
return 1
fi
# Check for recent successful backup
# Match actual success indicators, not error messages containing "backup complete"
local last_success
last_success=$(grep -iE "(backup uploaded|backup complete success)" "$BACKUP_LOG_FILE" 2>/dev/null | tail -1)
if [[ -z "$last_success" ]]; then
log_warn "⚠️ No successful backup found in logs"
return 1
fi
# Extract timestamp and calculate age
# Supports formats: [2026-04-24T04:00:02], 2026-04-24T04:00:02, 2026-04-24 04:00:02
local log_time
log_time=$(echo "$last_success" | grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2}[T ][0-9]{2}:[0-9]{2}:[0-9]{2}' | head -1)
if [[ -n "$log_time" ]]; then
local log_epoch now_epoch age_minutes
# Normalize space separator to T for consistent parsing
log_time=${log_time/ /T}
log_epoch=$(date -d "$log_time" +%s 2>/dev/null)
now_epoch=$(date +%s)
age_minutes=$(( (now_epoch - log_epoch) / 60 ))
if [[ $age_minutes -gt $MAX_BACKUP_AGE_MINUTES ]]; then
log_warn "⚠️ Last successful backup was $age_minutes minutes ago (max: $MAX_BACKUP_AGE_MINUTES)"
return 1
else
log_info "✅ Last backup was $age_minutes minutes ago"
fi
fi
# Check for recent errors
local recent_errors
recent_errors=$(grep -i "error\|failed\|exception" "$BACKUP_LOG_FILE" 2>/dev/null | tail -5)
if [[ -n "$recent_errors" ]]; then
log_warn "⚠️ Recent errors found in backup log:"
echo "$recent_errors" | while read -r line; do
log_warn " $line"
done
fi
return 0
}
# Check disk space
check_disk_space() {
log_info "Checking disk space..."
local usage
usage=$(df -h /root | awk 'NR==2 {print $5}' | tr -d '%')
if [[ "$usage" -gt 90 ]]; then
log_error "❌ Disk usage critical: ${usage}%"
return 1
elif [[ "$usage" -gt 80 ]]; then
log_warn "⚠️ Disk usage high: ${usage}%"
else
log_info "✅ Disk usage OK: ${usage}%"
fi
return 0
}
# Get failed attempts count
get_failed_attempts() {
if [[ -f "$FAILED_ATTEMPTS_FILE" ]]; then
cat "$FAILED_ATTEMPTS_FILE"
else
echo "0"
fi
}
# Increment failed attempts
increment_failed_attempts() {
local current
current=$(get_failed_attempts)
echo $((current + 1)) > "$FAILED_ATTEMPTS_FILE"
}
# Reset failed attempts
reset_failed_attempts() {
echo "0" > "$FAILED_ATTEMPTS_FILE"
}
# Check if backup or restore is currently running
check_backup_running() {
local my_pid=$$
local my_ppid=$PPID
local processes
processes=$(pgrep -af "backup\.py|openclaw-backup-cron|openclaw-restore" 2>/dev/null || true)
while IFS= read -r line; do
[[ -z "$line" ]] && continue
[[ "$line" == *"$my_pid"* ]] && continue
[[ "$line" == *"$my_ppid"* ]] && continue
local pid
pid=$(echo "$line" | awk '{print $1}')
[[ "$pid" == "$my_pid" ]] || [[ "$pid" == "$my_ppid" ]] && continue
if [[ "$line" =~ backup\.py ]]; then
local args
args=$(echo "$line" | sed 's/.*backup\.py//' | tr -s ' ')
if [[ "$args" =~ (^|[\s/])(backup|restore)($|[\s/]) ]] || [[ "$args" =~ \-\-command= ]]; then
log_info "Backup/restore is currently running: $line"
return 0
fi
fi
if [[ "$line" =~ openclaw-backup-cron ]] && [[ "$line" != *watchdog* ]]; then
log_info "Backup cron is currently running: $line"
return 0
fi
if [[ "$line" =~ openclaw-restore ]]; then
log_info "Restore is currently running: $line"
return 0
fi
done <<< "$processes"
return 1
}
# Repair function: Fix common issues
repair_backup_system() {
log_info "🔧 Attempting to repair backup system..."
# Check if backup/restore is running before doing any repairs
if check_backup_running; then
log_warn "⚠️ Backup or restore is currently running, skipping repair to avoid interference"
return 1
fi
local repairs_made=0
# Repair 1: Fix script permissions
log_info "Repair 1: Fixing script permissions..."
chmod +x /usr/local/bin/openclaw-*.sh 2>/dev/null && {
log_info "✅ Fixed script permissions"
((repairs_made++))
}
# Repair 2: Recreate backup log directory
if [[ ! -d "$BACKUP_LOG_DIR" ]]; then
log_info "Repair 2: Creating backup log directory..."
mkdir -p "$BACKUP_LOG_DIR"
touch "$BACKUP_LOG_FILE"
log_info "✅ Created backup log directory"
((repairs_made++))
fi
# Repair 3: Fix log file permissions
if [[ -f "$BACKUP_LOG_FILE" ]]; then
chmod 644 "$BACKUP_LOG_FILE"
fi
# Repair 4: Clear Python cache (might fix import issues)
log_info "Repair 4: Clearing Python cache..."
find /opt/openclaw-hf -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
log_info "✅ Cleared Python cache"
((repairs_made++))
# Repair 5: Test backup script manually (dry-run only to avoid polluting remote backups)
log_info "Repair 5: Testing backup script (dry-run)..."
if check_backup_running; then
log_warn "⚠️ Backup started while repair was in progress, skipping backup test"
else
local test_work_dir
test_work_dir=$(mktemp -d /tmp/openclaw-backup-test-XXXXXX)
if timeout 60 bash -c "
source '$BACKUP_ENV_FILE_PATH' 2>/dev/null || true
source /etc/profile.d/openclaw-env.sh 2>/dev/null || true
export HF_TOKEN
export OPENCLAW_BACKUP_DATASET_REPO='__health_check_test__invalid_repo__'
export OPENCLAW_BACKUP_WORK_DIR='$test_work_dir'
export OPENCLAW_BACKUP_KEEP_COUNT='1'
python3 /opt/openclaw-hf/openclaw_hf/backup.py backup 2>&1
" >> "$BACKUP_LOG_FILE" 2>&1; then
log_info "✅ Backup dry-run test successful"
reset_failed_attempts
((repairs_made++))
else
local test_exit_code=$?
if [[ -n $(find "$test_work_dir" -name 'openclaw-backup-*.tar.gz' -print -quit 2>/dev/null) ]]; then
log_info "✅ Backup dry-run test successful (archive created locally)"
reset_failed_attempts
((repairs_made++))
else
log_error "❌ Backup dry-run test failed (exit code: $test_exit_code)"
increment_failed_attempts
fi
fi
rm -rf "$test_work_dir" 2>/dev/null || true
fi
log_info "🔧 Repair complete. Repairs made: $repairs_made"
return $((repairs_made == 0 ? 1 : 0))
}
# Emergency repair: Reset everything
emergency_repair() {
log_warn "🚨 EMERGENCY REPAIR MODE"
# Clear all caches
rm -rf /root/.cache/huggingface 2>/dev/null || true
# Reinstall huggingface_hub if needed
python3 -m pip install --upgrade --force-reinstall "huggingface_hub[cli]>=0.31.1" 2>/dev/null || {
log_error "Failed to reinstall huggingface_hub"
return 1
}
# Reset failed attempts
reset_failed_attempts
log_info "🚨 Emergency repair complete"
return 0
}
# Main health check
run_health_check() {
log_info "========================================"
log_info "OpenClaw Backup Health Check Starting"
log_info "========================================"
load_env
local total_checks=0
local passed_checks=0
local failed_checks=0
# Check 1: Commands
((total_checks++))
if check_commands; then
((passed_checks++))
else
((failed_checks++))
fi
# Check 2: Environment variables
((total_checks++))
if check_env_vars; then
((passed_checks++))
else
((failed_checks++))
fi
# Check 3: HF Connectivity
((total_checks++))
if check_hf_connectivity; then
((passed_checks++))
else
((failed_checks++))
fi
# Check 4: Dataset access
((total_checks++))
if check_dataset_access; then
((passed_checks++))
else
((failed_checks++))
fi
# Check 5: Scripts
((total_checks++))
if check_backup_scripts; then
((passed_checks++))
else
((failed_checks++))
fi
# Check 6: Recent backups
((total_checks++))
if check_recent_backups; then
((passed_checks++))
else
((failed_checks++))
fi
# Check 7: Disk space
((total_checks++))
if check_disk_space; then
((passed_checks++))
else
((failed_checks++))
fi
# Summary
log_info "========================================"
log_info "Health Check Summary"
log_info "========================================"
log_info "Total checks: $total_checks"
log_info "✅ Passed: $passed_checks"
log_info "❌ Failed: $failed_checks"
# Auto-repair if needed
local failed_attempts
failed_attempts=$(get_failed_attempts)
if [[ $failed_checks -gt 0 ]]; then
log_warn "Some checks failed. Failed attempts: $failed_attempts/$MAX_FAILED_ATTEMPTS"
if [[ $failed_attempts -ge $MAX_FAILED_ATTEMPTS ]]; then
log_error "Max failed attempts reached. Running emergency repair..."
emergency_repair
else
repair_backup_system
fi
return 1
else
log_info "✅ All health checks passed!"
reset_failed_attempts
return 0
fi
}
# Show usage
usage() {
cat << EOF
OpenClaw Backup Health Check & Auto-Repair
Usage: $0 [OPTIONS]
Options:
-h, --help Show this help message
-c, --check Run health check only (no repair)
-r, --repair Run repair only
-e, --emergency Run emergency repair
-v, --verbose Enable verbose output
Examples:
$0 Run full health check with auto-repair
$0 --check Run health check only
$0 --repair Run repair only
$0 --emergency Run emergency repair
EOF
}
# Main
main() {
# Create log directory
mkdir -p "$BACKUP_LOG_DIR"
case "${1:-}" in
-h|--help)
usage
exit 0
;;
-c|--check)
load_env
check_commands && check_env_vars && check_hf_connectivity && \
check_dataset_access && check_backup_scripts && \
check_recent_backups && check_disk_space
exit $?
;;
-r|--repair)
load_env
repair_backup_system
exit $?
;;
-e|--emergency)
load_env
emergency_repair
exit $?
;;
-v|--verbose)
export OPENCLAW_DEBUG=true
run_health_check
exit $?
;;
*)
run_health_check
exit $?
;;
esac
}
main "$@"