#!/usr/bin/env bash
#
# OpenClaw Backup Health Check & Auto-Repair Script
# Health check and auto-repair script (健康检查与自动修复脚本)
#
set -euo pipefail
# Configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BACKUP_ENV_FILE_PATH="${OPENCLAW_BACKUP_ENV_FILE_PATH:-/root/.env.d/openclaw-backup.env}"
BACKUP_LOG_DIR="${OPENCLAW_BACKUP_LOG_DIR:-/var/log/openclaw}"
BACKUP_LOG_FILE="${BACKUP_LOG_DIR}/backup.log"
HEALTH_LOG_FILE="${BACKUP_LOG_DIR}/health-check.log"
MAX_BACKUP_AGE_MINUTES="${OPENCLAW_MAX_BACKUP_AGE_MINUTES:-30}"
MAX_FAILED_ATTEMPTS="${OPENCLAW_MAX_FAILED_ATTEMPTS:-3}"
FAILED_ATTEMPTS_FILE="${BACKUP_LOG_DIR}/.failed-attempts"
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Logging functions
log_info() {
echo -e "${GREEN}[INFO]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" | tee -a "$HEALTH_LOG_FILE"
}
log_warn() {
echo -e "${YELLOW}[WARN]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" | tee -a "$HEALTH_LOG_FILE"
}
log_error() {
echo -e "${RED}[ERROR]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" | tee -a "$HEALTH_LOG_FILE"
}
log_debug() {
if [[ "${OPENCLAW_DEBUG:-false}" == "true" ]]; then
echo -e "${BLUE}[DEBUG]${NC} $(date '+%Y-%m-%d %H:%M:%S') $*" | tee -a "$HEALTH_LOG_FILE"
fi
}
# Load environment
load_env() {
if [[ -f "$BACKUP_ENV_FILE_PATH" ]]; then
# shellcheck disable=SC1090
source "$BACKUP_ENV_FILE_PATH"
log_debug "Loaded environment from $BACKUP_ENV_FILE_PATH"
else
log_warn "Environment file not found: $BACKUP_ENV_FILE_PATH"
fi
}
# Check if required commands exist
check_commands() {
local missing=()
for cmd in python3 curl; do
if ! command -v "$cmd" &> /dev/null; then
missing+=("$cmd")
fi
done
if [[ ${#missing[@]} -gt 0 ]]; then
log_error "Missing required commands: ${missing[*]}"
return 1
fi
log_debug "All required commands available"
return 0
}
# Check environment variables
check_env_vars() {
local errors=0
# Required variables
if [[ -z "${HF_TOKEN:-}" ]]; then
log_error "HF_TOKEN is not set"
((errors++))
else
log_debug "HF_TOKEN is set"
fi
if [[ -z "${OPENCLAW_BACKUP_DATASET_REPO:-}" ]]; then
log_error "OPENCLAW_BACKUP_DATASET_REPO is not set"
((errors++))
else
log_debug "Dataset repo: $OPENCLAW_BACKUP_DATASET_REPO"
fi
# Optional variables with defaults
if [[ -z "${OPENCLAW_BACKUP_CRON:-}" ]]; then
log_warn "OPENCLAW_BACKUP_CRON not set, using default: */10 * * * *"
fi
return $errors
}
# Check HuggingFace API connectivity
check_hf_connectivity() {
log_info "Checking HuggingFace connectivity..."
if [[ "${OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED:-false}" == "false" ]]; then
log_info "Skipping HF connectivity check (OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED=false, network checks disabled)"
return 0
fi
local max_retries=3
local retry_count=0
local curl_opts="-sSf"
while [[ $retry_count -lt $max_retries ]]; do
local http_code retry_after wait_time
local response_headers
response_headers=$(curl $curl_opts -D- \
--connect-timeout 10 \
--max-time 30 \
"https://huggingface.co/api/whoami" \
-H "Authorization: Bearer ${HF_TOKEN:-}" \
-o /dev/null 2>&1) || true
http_code=$(echo "$response_headers" | tail -1)
retry_after=$(echo "$response_headers" | grep -i "retry-after:" | awk '{print $2}' | tr -d '\r')
if [[ "$http_code" == "200" ]]; then
log_info "✅ HuggingFace API is accessible"
return 0
elif [[ "$http_code" == "401" ]]; then
log_warn "HF API returned 401 (Unauthorized) - token may be invalid"
return 1
elif [[ "$http_code" == "429" ]]; then
wait_time=60
if [[ -n "$retry_after" ]] && [[ "$retry_after" =~ ^[0-9]+$ ]]; then
wait_time=$retry_after
log_warn "HF API returned 429 (Rate Limited) - Retry-After: ${wait_time}s"
else
wait_time=$((60 * (2 ** retry_count)))
log_warn "HF API returned 429 (Rate Limited) - using exponential backoff: ${wait_time}s"
fi
if [[ $retry_count -lt $((max_retries - 1)) ]]; then
log_info "Waiting ${wait_time} seconds before retry $((retry_count + 2))/${max_retries}..."
sleep "$wait_time"
fi
((retry_count++))
continue
fi
((retry_count++))
log_warn "HuggingFace API check failed (attempt $retry_count/$max_retries, code: $http_code)"
sleep 5
done
log_error "❌ Cannot connect to HuggingFace API"
log_warn "If this is a network issue, you can set OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED=false to skip"
return 1
}
# Check dataset access
check_dataset_access() {
local dataset="${OPENCLAW_BACKUP_DATASET_REPO:-}"
if [[ -z "$dataset" ]]; then
log_error "Cannot check dataset: OPENCLAW_BACKUP_DATASET_REPO not set"
return 1
fi
log_info "Checking dataset access: $dataset"
local result
result=$(python3 << PYEOF 2>&1
import sys
import os
sys.path.insert(0, '/opt/openclaw-hf')
try:
from huggingface_hub import HfApi
api = HfApi(token=os.getenv('HF_TOKEN'))
# Try to get dataset info
info = api.dataset_info("$dataset")
print(f"SUCCESS: Dataset exists, id={info.id}")
# Try to list files
files = list(api.list_repo_files("$dataset", repo_type='dataset'))
backup_files = [f for f in files if 'openclaw-backup' in f]
print(f"INFO: Found {len(backup_files)} backup files")
except Exception as e:
print(f"ERROR: {e}")
sys.exit(1)
PYEOF
)
if [[ "$result" == SUCCESS:* ]]; then
log_info "✅ Dataset access OK: ${result#SUCCESS: }"
return 0
else
log_error "❌ Dataset access failed: $result"
return 1
fi
}
# Check backup script integrity
check_backup_scripts() {
local scripts=(
"/usr/local/bin/openclaw-backup-cron.sh"
"/usr/local/bin/openclaw-restore.sh"
"/opt/openclaw-hf/openclaw_hf/backup.py"
)
local errors=0
for script in "${scripts[@]}"; do
if [[ ! -f "$script" ]]; then
log_error "❌ Script not found: $script"
((errors++))
elif [[ ! -x "$script" ]] && [[ "$script" != *.py ]]; then
log_warn "⚠️ Script not executable: $script"
chmod +x "$script" 2>/dev/null || {
log_error "❌ Cannot make $script executable"
((errors++))
}
else
log_debug "✅ Script OK: $script"
fi
done
return $errors
}
# Check backup log for recent failures
check_recent_backups() {
log_info "Checking recent backup status..."
if [[ ! -f "$BACKUP_LOG_FILE" ]]; then
log_warn "⚠️ Backup log not found: $BACKUP_LOG_FILE"
return 1
fi
# Check for recent successful backup
# Match actual success indicators, not error messages containing "backup complete"
local last_success
last_success=$(grep -iE "(backup uploaded|backup complete success)" "$BACKUP_LOG_FILE" 2>/dev/null | tail -1)
if [[ -z "$last_success" ]]; then
log_warn "⚠️ No successful backup found in logs"
return 1
fi
# Extract timestamp and calculate age
# Supports formats: [2026-04-24T04:00:02], 2026-04-24T04:00:02, 2026-04-24 04:00:02
local log_time
log_time=$(echo "$last_success" | grep -oE '[0-9]{4}-[0-9]{2}-[0-9]{2}[T ][0-9]{2}:[0-9]{2}:[0-9]{2}' | head -1)
if [[ -n "$log_time" ]]; then
local log_epoch now_epoch age_minutes
# Normalize space separator to T for consistent parsing
log_time=${log_time/ /T}
log_epoch=$(date -d "$log_time" +%s 2>/dev/null)
now_epoch=$(date +%s)
age_minutes=$(( (now_epoch - log_epoch) / 60 ))
if [[ $age_minutes -gt $MAX_BACKUP_AGE_MINUTES ]]; then
log_warn "⚠️ Last successful backup was $age_minutes minutes ago (max: $MAX_BACKUP_AGE_MINUTES)"
return 1
else
log_info "✅ Last backup was $age_minutes minutes ago"
fi
fi
# Check for recent errors
local recent_errors
recent_errors=$(grep -i "error\|failed\|exception" "$BACKUP_LOG_FILE" 2>/dev/null | tail -5)
if [[ -n "$recent_errors" ]]; then
log_warn "⚠️ Recent errors found in backup log:"
echo "$recent_errors" | while read -r line; do
log_warn " $line"
done
fi
return 0
}
# Check disk space
check_disk_space() {
log_info "Checking disk space..."
local usage
usage=$(df -h /root | awk 'NR==2 {print $5}' | tr -d '%')
if [[ "$usage" -gt 90 ]]; then
log_error "❌ Disk usage critical: ${usage}%"
return 1
elif [[ "$usage" -gt 80 ]]; then
log_warn "⚠️ Disk usage high: ${usage}%"
else
log_info "✅ Disk usage OK: ${usage}%"
fi
return 0
}
# Get failed attempts count
get_failed_attempts() {
if [[ -f "$FAILED_ATTEMPTS_FILE" ]]; then
cat "$FAILED_ATTEMPTS_FILE"
else
echo "0"
fi
}
# Increment failed attempts
increment_failed_attempts() {
local current
current=$(get_failed_attempts)
echo $((current + 1)) > "$FAILED_ATTEMPTS_FILE"
}
# Reset failed attempts
reset_failed_attempts() {
echo "0" > "$FAILED_ATTEMPTS_FILE"
}
# Check if backup or restore is currently running
check_backup_running() {
local my_pid=$$
local my_ppid=$PPID
local processes
processes=$(pgrep -af "backup\.py|openclaw-backup-cron|openclaw-restore" 2>/dev/null || true)
while IFS= read -r line; do
[[ -z "$line" ]] && continue
[[ "$line" == *"$my_pid"* ]] && continue
[[ "$line" == *"$my_ppid"* ]] && continue
local pid
pid=$(echo "$line" | awk '{print $1}')
[[ "$pid" == "$my_pid" ]] || [[ "$pid" == "$my_ppid" ]] && continue
if [[ "$line" =~ backup\.py ]]; then
local args
args=$(echo "$line" | sed 's/.*backup\.py//' | tr -s ' ')
if [[ "$args" =~ (^|[\s/])(backup|restore)($|[\s/]) ]] || [[ "$args" =~ \-\-command= ]]; then
log_info "Backup/restore is currently running: $line"
return 0
fi
fi
if [[ "$line" =~ openclaw-backup-cron ]] && [[ "$line" != *watchdog* ]]; then
log_info "Backup cron is currently running: $line"
return 0
fi
if [[ "$line" =~ openclaw-restore ]]; then
log_info "Restore is currently running: $line"
return 0
fi
done <<< "$processes"
return 1
}
# Repair function: Fix common issues
repair_backup_system() {
log_info "🔧 Attempting to repair backup system..."
# Check if backup/restore is running before doing any repairs
if check_backup_running; then
log_warn "⚠️ Backup or restore is currently running, skipping repair to avoid interference"
return 1
fi
local repairs_made=0
# Repair 1: Fix script permissions
log_info "Repair 1: Fixing script permissions..."
chmod +x /usr/local/bin/openclaw-*.sh 2>/dev/null && {
log_info "✅ Fixed script permissions"
((repairs_made++))
}
# Repair 2: Recreate backup log directory
if [[ ! -d "$BACKUP_LOG_DIR" ]]; then
log_info "Repair 2: Creating backup log directory..."
mkdir -p "$BACKUP_LOG_DIR"
touch "$BACKUP_LOG_FILE"
log_info "✅ Created backup log directory"
((repairs_made++))
fi
# Repair 3: Fix log file permissions
if [[ -f "$BACKUP_LOG_FILE" ]]; then
chmod 644 "$BACKUP_LOG_FILE"
fi
# Repair 4: Clear Python cache (might fix import issues)
log_info "Repair 4: Clearing Python cache..."
find /opt/openclaw-hf -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
log_info "✅ Cleared Python cache"
((repairs_made++))
# Repair 5: Test backup script manually (dry-run only to avoid polluting remote backups)
log_info "Repair 5: Testing backup script (dry-run)..."
if check_backup_running; then
log_warn "⚠️ Backup started while repair was in progress, skipping backup test"
else
local test_work_dir
test_work_dir=$(mktemp -d /tmp/openclaw-backup-test-XXXXXX)
if timeout 60 bash -c "
source '$BACKUP_ENV_FILE_PATH' 2>/dev/null || true
source /etc/profile.d/openclaw-env.sh 2>/dev/null || true
export HF_TOKEN
export OPENCLAW_BACKUP_DATASET_REPO='__health_check_test__invalid_repo__'
export OPENCLAW_BACKUP_WORK_DIR='$test_work_dir'
export OPENCLAW_BACKUP_KEEP_COUNT='1'
python3 /opt/openclaw-hf/openclaw_hf/backup.py backup 2>&1
" >> "$BACKUP_LOG_FILE" 2>&1; then
log_info "✅ Backup dry-run test successful"
reset_failed_attempts
((repairs_made++))
else
local test_exit_code=$?
if [[ -n $(find "$test_work_dir" -name 'openclaw-backup-*.tar.gz' -print -quit 2>/dev/null) ]]; then
log_info "✅ Backup dry-run test successful (archive created locally)"
reset_failed_attempts
((repairs_made++))
else
log_error "❌ Backup dry-run test failed (exit code: $test_exit_code)"
increment_failed_attempts
fi
fi
rm -rf "$test_work_dir" 2>/dev/null || true
fi
log_info "🔧 Repair complete. Repairs made: $repairs_made"
return $((repairs_made == 0 ? 1 : 0))
}
# Emergency repair: Reset everything
emergency_repair() {
log_warn "🚨 EMERGENCY REPAIR MODE"
# Clear all caches
rm -rf /root/.cache/huggingface 2>/dev/null || true
# Reinstall huggingface_hub if needed
python3 -m pip install --upgrade --force-reinstall "huggingface_hub[cli]>=0.31.1" 2>/dev/null || {
log_error "Failed to reinstall huggingface_hub"
return 1
}
# Reset failed attempts
reset_failed_attempts
log_info "🚨 Emergency repair complete"
return 0
}
# Main health check
run_health_check() {
log_info "========================================"
log_info "OpenClaw Backup Health Check Starting"
log_info "========================================"
load_env
local total_checks=0
local passed_checks=0
local failed_checks=0
# Check 1: Commands
((total_checks++))
if check_commands; then
((passed_checks++))
else
((failed_checks++))
fi
# Check 2: Environment variables
((total_checks++))
if check_env_vars; then
((passed_checks++))
else
((failed_checks++))
fi
# Check 3: HF Connectivity
((total_checks++))
if check_hf_connectivity; then
((passed_checks++))
else
((failed_checks++))
fi
# Check 4: Dataset access
((total_checks++))
if check_dataset_access; then
((passed_checks++))
else
((failed_checks++))
fi
# Check 5: Scripts
((total_checks++))
if check_backup_scripts; then
((passed_checks++))
else
((failed_checks++))
fi
# Check 6: Recent backups
((total_checks++))
if check_recent_backups; then
((passed_checks++))
else
((failed_checks++))
fi
# Check 7: Disk space
((total_checks++))
if check_disk_space; then
((passed_checks++))
else
((failed_checks++))
fi
# Summary
log_info "========================================"
log_info "Health Check Summary"
log_info "========================================"
log_info "Total checks: $total_checks"
log_info "✅ Passed: $passed_checks"
log_info "❌ Failed: $failed_checks"
# Auto-repair if needed
local failed_attempts
failed_attempts=$(get_failed_attempts)
if [[ $failed_checks -gt 0 ]]; then
log_warn "Some checks failed. Failed attempts: $failed_attempts/$MAX_FAILED_ATTEMPTS"
if [[ $failed_attempts -ge $MAX_FAILED_ATTEMPTS ]]; then
log_error "Max failed attempts reached. Running emergency repair..."
emergency_repair
else
repair_backup_system
fi
return 1
else
log_info "✅ All health checks passed!"
reset_failed_attempts
return 0
fi
}
# Show usage
usage() {
cat << EOF
OpenClaw Backup Health Check & Auto-Repair
Usage: $0 [OPTIONS]
Options:
-h, --help Show this help message
-c, --check Run health check only (no repair)
-r, --repair Run repair only
-e, --emergency Run emergency repair
-v, --verbose Enable verbose output
Examples:
$0 Run full health check with auto-repair
$0 --check Run health check only
$0 --repair Run repair only
$0 --emergency Run emergency repair
EOF
}
# Main
main() {
# Create log directory
mkdir -p "$BACKUP_LOG_DIR"
case "${1:-}" in
-h|--help)
usage
exit 0
;;
-c|--check)
load_env
check_commands && check_env_vars && check_hf_connectivity && \
check_dataset_access && check_backup_scripts && \
check_recent_backups && check_disk_space
exit $?
;;
-r|--repair)
load_env
repair_backup_system
exit $?
;;
-e|--emergency)
load_env
emergency_repair
exit $?
;;
-v|--verbose)
export OPENCLAW_DEBUG=true
run_health_check
exit $?
;;
*)
run_health_check
exit $?
;;
esac
}
main "$@"