#!/usr/bin/env bash
#
# OpenClaw Backup Health Check & Auto-Repair Script
# 健康检查与自动修复脚本
#

# Strict mode: stop on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ---------------------------------------------------------------------------
# Configuration
# Values written as ${OPENCLAW_...:-default} can be overridden from the
# environment; everything else is derived from them.
# ---------------------------------------------------------------------------

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "$SCRIPT_DIR" && pwd)"

# Environment file sourced by load_env.
BACKUP_ENV_FILE_PATH="${OPENCLAW_BACKUP_ENV_FILE_PATH:-/root/.env.d/openclaw-backup.env}"

# Log directory and the files kept inside it.
BACKUP_LOG_DIR="${OPENCLAW_BACKUP_LOG_DIR:-/var/log/openclaw}"
BACKUP_LOG_FILE="$BACKUP_LOG_DIR/backup.log"
HEALTH_LOG_FILE="$BACKUP_LOG_DIR/health-check.log"
FAILED_ATTEMPTS_FILE="$BACKUP_LOG_DIR/.failed-attempts"

# Thresholds: maximum age of the last successful backup, and the number of
# recorded failed attempts at which emergency repair is used.
MAX_BACKUP_AGE_MINUTES="${OPENCLAW_MAX_BACKUP_AGE_MINUTES:-30}"
MAX_FAILED_ATTEMPTS="${OPENCLAW_MAX_FAILED_ATTEMPTS:-3}"

# ANSI colour escapes used by the logging helpers (expanded at print time).
NC='\033[0m' # No Color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Logging functions

# Print an INFO line (green tag, timestamp, message) to stdout and append the
# same line to $HEALTH_LOG_FILE.
# Arguments: $* - message words, joined with single spaces
# The message is passed through printf '%s', so backslash sequences inside it
# (paths, relayed log lines) are written literally instead of being expanded.
log_info() {
    printf '%b[INFO]%b %s %s\n' "$GREEN" "$NC" "$(date '+%Y-%m-%d %H:%M:%S')" "$*" \
        | tee -a "$HEALTH_LOG_FILE"
}

# Print a WARN line (yellow tag, timestamp, message) to stdout and append the
# same line to $HEALTH_LOG_FILE.
# Arguments: $* - message words, joined with single spaces
# The message is passed through printf '%s', so backslash sequences inside it
# are written literally instead of being expanded.
log_warn() {
    printf '%b[WARN]%b %s %s\n' "$YELLOW" "$NC" "$(date '+%Y-%m-%d %H:%M:%S')" "$*" \
        | tee -a "$HEALTH_LOG_FILE"
}

# Print an ERROR line (red tag, timestamp, message) to stdout and append the
# same line to $HEALTH_LOG_FILE.
# Arguments: $* - message words, joined with single spaces
# The message is passed through printf '%s', so backslash sequences inside it
# are written literally instead of being expanded.
log_error() {
    printf '%b[ERROR]%b %s %s\n' "$RED" "$NC" "$(date '+%Y-%m-%d %H:%M:%S')" "$*" \
        | tee -a "$HEALTH_LOG_FILE"
}

# Print a DEBUG line (blue tag, timestamp, message) to stdout and append it to
# $HEALTH_LOG_FILE, but only when OPENCLAW_DEBUG is exactly "true"; otherwise
# do nothing.
# Arguments: $* - message words, joined with single spaces
# The message is passed through printf '%s', so backslash sequences inside it
# are written literally instead of being expanded.
log_debug() {
    if [[ "${OPENCLAW_DEBUG:-false}" == "true" ]]; then
        printf '%b[DEBUG]%b %s %s\n' "$BLUE" "$NC" "$(date '+%Y-%m-%d %H:%M:%S')" "$*" \
            | tee -a "$HEALTH_LOG_FILE"
    fi
}

# Load environment
# Source $BACKUP_ENV_FILE_PATH into the current shell if the file exists;
# otherwise log a warning and carry on.
# The file is sourced with allexport (set -a) enabled so that plain VAR=value
# assignments in it are also exported to child processes (for example the
# python3 helper that reads HF_TOKEN from its environment). The previous
# allexport state is restored afterwards.
load_env() {
    if [[ -f "$BACKUP_ENV_FILE_PATH" ]]; then
        local allexport_was_on=false
        if [[ $- == *a* ]]; then
            allexport_was_on=true
        fi

        set -a
        # shellcheck disable=SC1090
        source "$BACKUP_ENV_FILE_PATH"
        if [[ "$allexport_was_on" == false ]]; then
            set +a
        fi

        log_debug "Loaded environment from $BACKUP_ENV_FILE_PATH"
    else
        log_warn "Environment file not found: $BACKUP_ENV_FILE_PATH"
    fi
}

# Check if required commands exist
# Arguments: optional list of command names to look up; with no arguments the
#            default set (python3, curl) is checked.
# Returns:   0 when every command is found by `command -v`, 1 otherwise
#            (the missing names are reported through log_error).
check_commands() {
    if [[ $# -eq 0 ]]; then
        set -- python3 curl
    fi

    local missing=()
    local cmd   # kept local so the loop does not clobber a caller's variable

    for cmd in "$@"; do
        if ! command -v "$cmd" &> /dev/null; then
            missing+=("$cmd")
        fi
    done

    if [[ ${#missing[@]} -gt 0 ]]; then
        log_error "Missing required commands: ${missing[*]}"
        return 1
    fi

    log_debug "All required commands available"
    return 0
}

# Check environment variables
# Verifies that HF_TOKEN and OPENCLAW_BACKUP_DATASET_REPO are non-empty and
# warns when the optional OPENCLAW_BACKUP_CRON is not set.
# Returns: the number of missing required variables (0 means success).
check_env_vars() {
    local errors=0

    # Required variables.
    # Counters use errors=$((errors + 1)) rather than ((errors++)): the latter
    # returns status 1 when the old value is 0, which aborts under `set -e`.
    if [[ -z "${HF_TOKEN:-}" ]]; then
        log_error "HF_TOKEN is not set"
        errors=$((errors + 1))
    else
        log_debug "HF_TOKEN is set"
    fi

    if [[ -z "${OPENCLAW_BACKUP_DATASET_REPO:-}" ]]; then
        log_error "OPENCLAW_BACKUP_DATASET_REPO is not set"
        errors=$((errors + 1))
    else
        log_debug "Dataset repo: $OPENCLAW_BACKUP_DATASET_REPO"
    fi

    # Optional variables with defaults
    if [[ -z "${OPENCLAW_BACKUP_CRON:-}" ]]; then
        log_warn "OPENCLAW_BACKUP_CRON not set, using default: */10 * * * *"
    fi

    return "$errors"
}

# Check HuggingFace API connectivity
# Calls the HuggingFace whoami endpoint with HF_TOKEN, up to 3 attempts.
# Skipped (returns 0) unless OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED is set to
# something other than "false".
# Returns: 0 on HTTP 200 or when skipped; 1 on HTTP 401 or when every attempt
#          fails.
check_hf_connectivity() {
    log_info "Checking HuggingFace connectivity..."

    if [[ "${OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED:-false}" == "false" ]]; then
        log_info "Skipping HF connectivity check (OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED=false, network checks disabled)"
        return 0
    fi

    local -r max_retries=3
    local attempt=0
    local response http_code retry_after wait_time

    while (( attempt < max_retries )); do
        attempt=$((attempt + 1))

        # -D - writes the response headers to stdout and -w appends the numeric
        # status code after them, so the code is always the last line of the
        # capture. No -f: 4xx/5xx responses must be reported as their status
        # code, not as a curl failure. A connection failure yields code 000.
        response=$(curl -sS -D - -o /dev/null -w '%{http_code}' \
            --connect-timeout 10 \
            --max-time 30 \
            -H "Authorization: Bearer ${HF_TOKEN:-}" \
            "https://huggingface.co/api/whoami" 2>/dev/null) || true
        http_code=${response##*$'\n'}
        http_code=${http_code//$'\r'/}
        # Header names are case-insensitive; take the first Retry-After value.
        retry_after=$(awk 'tolower($1) == "retry-after:" { sub(/\r$/, "", $2); print $2; exit }' \
            <<< "$response") || true

        case "$http_code" in
            200)
                log_info "✅ HuggingFace API is accessible"
                return 0
                ;;
            401)
                log_warn "HF API returned 401 (Unauthorized) - token may be invalid"
                return 1
                ;;
            429)
                if [[ "$retry_after" =~ ^[0-9]+$ ]]; then
                    wait_time=$retry_after
                    log_warn "HF API returned 429 (Rate Limited) - Retry-After: ${wait_time}s"
                else
                    # 60s, 120s, 240s for attempts 1, 2, 3.
                    wait_time=$((60 * (2 ** (attempt - 1))))
                    log_warn "HF API returned 429 (Rate Limited) - using exponential backoff: ${wait_time}s"
                fi

                if (( attempt < max_retries )); then
                    log_info "Waiting ${wait_time} seconds before retry $((attempt + 1))/${max_retries}..."
                    sleep "$wait_time"
                fi
                ;;
            *)
                log_warn "HuggingFace API check failed (attempt $attempt/$max_retries, code: $http_code)"
                # Pause only when another attempt will follow.
                if (( attempt < max_retries )); then
                    sleep 5
                fi
                ;;
        esac
    done

    log_error "❌ Cannot connect to HuggingFace API"
    log_warn "If this is a network issue, you can set OPENCLAW_BACKUP_HEALTH_CHECK_ENABLED=false to skip"
    return 1
}

# Check dataset access
# Runs a small python3 helper that asks huggingface_hub for the dataset named
# by OPENCLAW_BACKUP_DATASET_REPO and lists its files.
# Returns: 0 when the helper's output starts with "SUCCESS:", 1 otherwise
#          (including when the repo variable is empty).
check_dataset_access() {
    local dataset="${OPENCLAW_BACKUP_DATASET_REPO:-}"

    if [[ -z "$dataset" ]]; then
        log_error "Cannot check dataset: OPENCLAW_BACKUP_DATASET_REPO not set"
        return 1
    fi

    log_info "Checking dataset access: $dataset"

    # The dataset name and token are handed to Python through the environment
    # and the heredoc delimiter is quoted, so no shell value is ever spliced
    # into the Python source (a name containing quotes cannot break or inject
    # code). Passing HF_TOKEN explicitly also covers the case where the shell
    # variable is set but not exported.
    local result
    result=$(OPENCLAW_HEALTH_DATASET="$dataset" HF_TOKEN="${HF_TOKEN:-}" python3 << 'PYEOF' 2>&1
import sys
import os
sys.path.insert(0, '/opt/openclaw-hf')

dataset = os.environ['OPENCLAW_HEALTH_DATASET']

try:
    from huggingface_hub import HfApi
    # An empty token is treated the same as no token.
    api = HfApi(token=os.environ.get('HF_TOKEN') or None)

    # Try to get dataset info
    info = api.dataset_info(dataset)
    print(f"SUCCESS: Dataset exists, id={info.id}")

    # Try to list files
    files = list(api.list_repo_files(dataset, repo_type='dataset'))
    backup_files = [f for f in files if 'openclaw-backup' in f]
    print(f"INFO: Found {len(backup_files)} backup files")

except Exception as e:
    print(f"ERROR: {e}")
    sys.exit(1)
PYEOF
    ) || true   # a failing helper is reported below instead of tripping set -e

    if [[ "$result" == SUCCESS:* ]]; then
        log_info "✅ Dataset access OK: ${result#SUCCESS: }"
        return 0
    else
        log_error "❌ Dataset access failed: $result"
        return 1
    fi
}

# Check backup script integrity
# Verifies that each backup script exists and, unless its name ends in .py,
# is executable; a non-executable script gets `chmod +x` applied.
# Arguments: optional list of script paths; with no arguments the three
#            installed OpenClaw scripts are checked.
# Returns:   the number of scripts that are missing or could not be made
#            executable (0 means success).
check_backup_scripts() {
    if [[ $# -eq 0 ]]; then
        set -- \
            "/usr/local/bin/openclaw-backup-cron.sh" \
            "/usr/local/bin/openclaw-restore.sh" \
            "/opt/openclaw-hf/openclaw_hf/backup.py"
    fi

    local errors=0
    local script   # kept local so the loop does not clobber a caller's variable

    # errors=$((errors + 1)) is used instead of ((errors++)): the latter
    # returns status 1 when the old value is 0, which aborts under `set -e`.
    for script in "$@"; do
        if [[ ! -f "$script" ]]; then
            log_error "❌ Script not found: $script"
            errors=$((errors + 1))
        elif [[ ! -x "$script" && "$script" != *.py ]]; then
            log_warn "⚠️ Script not executable: $script"
            if ! chmod +x "$script" 2>/dev/null; then
                log_error "❌ Cannot make $script executable"
                errors=$((errors + 1))
            fi
        else
            log_debug "✅ Script OK: $script"
        fi
    done

    return "$errors"
}

# Check backup log for recent failures
# Looks in $BACKUP_LOG_FILE for the most recent success line, compares its
# timestamp with MAX_BACKUP_AGE_MINUTES, and relays the last few lines that
# mention an error as warnings.
# Returns: 1 when the log is missing, has no success line, or the last success
#          is older than the limit; 0 otherwise.
check_recent_backups() {
    log_info "Checking recent backup status..."

    if [[ ! -f "$BACKUP_LOG_FILE" ]]; then
        log_warn "⚠️ Backup log not found: $BACKUP_LOG_FILE"
        return 1
    fi

    # Check for recent successful backup
    # Match actual success indicators, not error messages containing "backup complete"
    # `|| true`: with pipefail, grep finding nothing would fail the pipeline.
    local last_success
    last_success=$(grep -iE "(backup uploaded|backup complete success)" "$BACKUP_LOG_FILE" 2>/dev/null | tail -1) || true

    if [[ -z "$last_success" ]]; then
        log_warn "⚠️ No successful backup found in logs"
        return 1
    fi

    # Extract timestamp and calculate age
    # Supports formats: [2026-04-24T04:00:02], 2026-04-24T04:00:02, 2026-04-24 04:00:02
    local -r timestamp_re='[0-9]{4}-[0-9]{2}-[0-9]{2}[T ][0-9]{2}:[0-9]{2}:[0-9]{2}'
    local log_time=""
    if [[ "$last_success" =~ $timestamp_re ]]; then
        log_time=${BASH_REMATCH[0]}
    fi

    if [[ -n "$log_time" ]]; then
        local log_epoch now_epoch age_minutes
        # Normalize space separator to T for consistent parsing
        log_time=${log_time/ /T}
        # Only do the arithmetic when date actually produced an epoch value;
        # an impossible date (e.g. month 13) would otherwise leave log_epoch
        # empty and break the age calculation.
        if log_epoch=$(date -d "$log_time" +%s 2>/dev/null) && [[ "$log_epoch" =~ ^-?[0-9]+$ ]]; then
            now_epoch=$(date +%s)
            age_minutes=$(( (now_epoch - log_epoch) / 60 ))

            if [[ $age_minutes -gt $MAX_BACKUP_AGE_MINUTES ]]; then
                log_warn "⚠️ Last successful backup was $age_minutes minutes ago (max: $MAX_BACKUP_AGE_MINUTES)"
                return 1
            else
                log_info "✅ Last backup was $age_minutes minutes ago"
            fi
        else
            # Treated like a line without a timestamp: the age check is skipped.
            log_warn "⚠️ Could not parse timestamp of last successful backup: $log_time"
        fi
    fi

    # Check for recent errors
    local recent_errors
    recent_errors=$(grep -i "error\|failed\|exception" "$BACKUP_LOG_FILE" 2>/dev/null | tail -5) || true

    if [[ -n "$recent_errors" ]]; then
        log_warn "⚠️ Recent errors found in backup log:"
        local line
        while read -r line; do
            log_warn "  $line"
        done <<< "$recent_errors"
    fi

    return 0
}

# Check disk space
# Reads the used-space percentage of the filesystem holding a path.
# Arguments: $1 - path to inspect (optional, default /root)
# Returns:   1 when usage is above 90% or cannot be determined; 0 otherwise
#            (a warning is logged above 80%).
check_disk_space() {
    local target="${1:-/root}"

    log_info "Checking disk space..."

    # df -P keeps every filesystem on a single line, so the percentage is
    # reliably field 5 of line 2 even for long device names.
    local usage
    usage=$(df -P "$target" 2>/dev/null | awk 'NR==2 { gsub(/%/, "", $5); print $5 }') || true

    # Without this check an empty value would compare as 0 and be reported as OK.
    if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
        log_error "❌ Unable to determine disk usage for $target"
        return 1
    fi

    if (( usage > 90 )); then
        log_error "❌ Disk usage critical: ${usage}%"
        return 1
    elif (( usage > 80 )); then
        log_warn "⚠️ Disk usage high: ${usage}%"
    else
        log_info "✅ Disk usage OK: ${usage}%"
    fi

    return 0
}

# Get failed attempts count
# Prints the counter stored on the first line of $FAILED_ATTEMPTS_FILE.
# Prints 0 when the file is missing, empty, or does not hold a plain
# non-negative integer, so callers can always use the result in arithmetic.
get_failed_attempts() {
    local count=""

    if [[ -f "$FAILED_ATTEMPTS_FILE" ]]; then
        # read returns non-zero on a file without a trailing newline but still
        # fills the variable, hence the `|| true`.
        read -r count < "$FAILED_ATTEMPTS_FILE" || true
    fi

    if [[ "$count" =~ ^[0-9]+$ ]]; then
        # 10# forces base 10 so a value such as "08" is not read as octal.
        echo "$((10#$count))"
    else
        echo "0"
    fi
}

# Increment failed attempts
# Reads the current counter via get_failed_attempts and writes the value plus
# one back to $FAILED_ATTEMPTS_FILE.
increment_failed_attempts() {
    local previous next
    previous=$(get_failed_attempts)
    next=$((previous + 1))
    printf '%s\n' "$next" > "$FAILED_ATTEMPTS_FILE"
}

# Reset failed attempts
# Overwrites $FAILED_ATTEMPTS_FILE with a single line containing 0.
reset_failed_attempts() {
    printf '%s\n' "0" > "$FAILED_ATTEMPTS_FILE"
}

# Check if backup or restore is currently running
# Scans `pgrep -af` output for backup.py (invoked with a backup/restore word
# or a --command= flag), openclaw-backup-cron (except lines mentioning
# "watchdog"), or openclaw-restore, ignoring this shell and its parent.
# Returns: 0 when such a process is found, 1 otherwise.
check_backup_running() {
    local -r self_pid=$$
    local -r parent_pid=$PPID
    # Kept in variables so the regexes reach =~ unmodified. POSIX classes are
    # required here: "\s" inside a bracket expression does not match whitespace.
    local -r mode_re='(^|[[:space:]/])(backup|restore)($|[[:space:]/])'
    local -r command_flag_re='--command='

    local processes
    processes=$(pgrep -af "backup\.py|openclaw-backup-cron|openclaw-restore" 2>/dev/null || true)

    local line pid args
    while IFS= read -r line; do
        if [[ -z "$line" ]]; then
            continue
        fi

        # pgrep -a prints "<pid> <command line>". Compare the PID field exactly;
        # a substring match on the whole line would also skip unrelated
        # processes whose PID or arguments merely contain those digits.
        pid=${line%%[[:space:]]*}
        if [[ "$pid" == "$self_pid" || "$pid" == "$parent_pid" ]]; then
            continue
        fi

        if [[ "$line" == *backup.py* ]]; then
            # Everything after the last "backup.py" on the command line.
            args=${line##*backup.py}
            if [[ "$args" =~ $mode_re || "$args" =~ $command_flag_re ]]; then
                log_info "Backup/restore is currently running: $line"
                return 0
            fi
        fi

        if [[ "$line" == *openclaw-backup-cron* && "$line" != *watchdog* ]]; then
            log_info "Backup cron is currently running: $line"
            return 0
        fi

        if [[ "$line" == *openclaw-restore* ]]; then
            log_info "Restore is currently running: $line"
            return 0
        fi
    done <<< "$processes"

    return 1
}

# Repair function: Fix common issues
# Runs a fixed sequence of best-effort repairs: script permissions, log
# directory, log file mode, Python cache, and a local test run of backup.py.
# Nothing is done while a backup or restore is running.
# Returns: 0 when at least one repair step succeeded; 1 when none did or when
#          a backup/restore was running.
repair_backup_system() {
    log_info "🔧 Attempting to repair backup system..."

    # Check if backup/restore is running before doing any repairs
    if check_backup_running; then
        log_warn "⚠️ Backup or restore is currently running, skipping repair to avoid interference"
        return 1
    fi

    # Counted with repairs_made=$((repairs_made + 1)); ((repairs_made++))
    # returns status 1 when the old value is 0 and would abort under `set -e`.
    local repairs_made=0

    # Repair 1: Fix script permissions
    log_info "Repair 1: Fixing script permissions..."
    if chmod +x /usr/local/bin/openclaw-*.sh 2>/dev/null; then
        log_info "✅ Fixed script permissions"
        repairs_made=$((repairs_made + 1))
    fi

    # Repair 2: Recreate backup log directory
    if [[ ! -d "$BACKUP_LOG_DIR" ]]; then
        log_info "Repair 2: Creating backup log directory..."
        mkdir -p "$BACKUP_LOG_DIR"
        touch "$BACKUP_LOG_FILE"
        log_info "✅ Created backup log directory"
        repairs_made=$((repairs_made + 1))
    fi

    # Repair 3: Fix log file permissions
    if [[ -f "$BACKUP_LOG_FILE" ]]; then
        chmod 644 "$BACKUP_LOG_FILE"
    fi

    # Repair 4: Clear Python cache (might fix import issues)
    log_info "Repair 4: Clearing Python cache..."
    find /opt/openclaw-hf -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
    log_info "✅ Cleared Python cache"
    repairs_made=$((repairs_made + 1))

    # Repair 5: Test backup script manually (dry-run only to avoid polluting remote backups)
    log_info "Repair 5: Testing backup script (dry-run)..."
    if check_backup_running; then
        log_warn "⚠️ Backup started while repair was in progress, skipping backup test"
    else
        local test_work_dir test_exit_code
        test_work_dir=$(mktemp -d /tmp/openclaw-backup-test-XXXXXX)
        # The env-file path and work directory are passed as positional
        # arguments ($1, $2) rather than spliced into the command string, so
        # paths containing quotes or spaces cannot break the inner script.
        # The dataset repo is deliberately set to an invalid name.
        # shellcheck disable=SC2016
        if timeout 60 bash -c '
            source "$1" 2>/dev/null || true
            source /etc/profile.d/openclaw-env.sh 2>/dev/null || true
            export HF_TOKEN
            export OPENCLAW_BACKUP_DATASET_REPO="__health_check_test__invalid_repo__"
            export OPENCLAW_BACKUP_WORK_DIR="$2"
            export OPENCLAW_BACKUP_KEEP_COUNT="1"
            python3 /opt/openclaw-hf/openclaw_hf/backup.py backup 2>&1
        ' _ "$BACKUP_ENV_FILE_PATH" "$test_work_dir" >> "$BACKUP_LOG_FILE" 2>&1; then
            log_info "✅ Backup dry-run test successful"
            reset_failed_attempts
            repairs_made=$((repairs_made + 1))
        else
            test_exit_code=$?
            # A non-zero exit still counts as success if an archive was
            # produced in the work directory.
            if [[ -n $(find "$test_work_dir" -name 'openclaw-backup-*.tar.gz' -print -quit 2>/dev/null) ]]; then
                log_info "✅ Backup dry-run test successful (archive created locally)"
                reset_failed_attempts
                repairs_made=$((repairs_made + 1))
            else
                log_error "❌ Backup dry-run test failed (exit code: $test_exit_code)"
                increment_failed_attempts
            fi
        fi
        rm -rf "${test_work_dir:?}" 2>/dev/null || true
    fi

    log_info "🔧 Repair complete. Repairs made: $repairs_made"
    return $((repairs_made == 0 ? 1 : 0))
}

# Emergency repair: Reset everything
# Removes the HuggingFace cache under /root, force-reinstalls huggingface_hub
# through pip, and resets the failed-attempts counter.
# Returns: 0 on success, 1 when the pip reinstall fails (the counter is left
#          untouched in that case).
emergency_repair() {
    log_warn "🚨 EMERGENCY REPAIR MODE"

    # Best-effort cache wipe; failures are ignored.
    rm -rf /root/.cache/huggingface 2>/dev/null || true

    # Force a fresh install of the hub client.
    if ! python3 -m pip install --upgrade --force-reinstall "huggingface_hub[cli]>=0.31.1" 2>/dev/null; then
        log_error "Failed to reinstall huggingface_hub"
        return 1
    fi

    # Start counting failures from scratch again.
    reset_failed_attempts

    log_info "🚨 Emergency repair complete"
    return 0
}

# Main health check
# Loads the environment, runs the seven checks in order, logs a summary, and
# on any failure starts a repair: emergency_repair once the recorded failed
# attempts reach MAX_FAILED_ATTEMPTS, repair_backup_system otherwise.
# Returns: 0 when every check passed (the failed-attempts counter is reset),
#          1 when at least one check failed.
run_health_check() {
    log_info "========================================"
    log_info "OpenClaw Backup Health Check Starting"
    log_info "========================================"

    load_env

    local total_checks=0
    local passed_checks=0
    local failed_checks=0

    # Counters use x=$((x + 1)); ((x++)) returns status 1 when the old value
    # is 0, which would terminate the script under `set -e` at the very first
    # increment.
    local check
    for check in \
        check_commands \
        check_env_vars \
        check_hf_connectivity \
        check_dataset_access \
        check_backup_scripts \
        check_recent_backups \
        check_disk_space; do
        total_checks=$((total_checks + 1))
        if "$check"; then
            passed_checks=$((passed_checks + 1))
        else
            failed_checks=$((failed_checks + 1))
        fi
    done

    # Summary
    log_info "========================================"
    log_info "Health Check Summary"
    log_info "========================================"
    log_info "Total checks: $total_checks"
    log_info "✅ Passed: $passed_checks"
    log_info "❌ Failed: $failed_checks"

    # Auto-repair if needed
    local failed_attempts
    failed_attempts=$(get_failed_attempts)

    if [[ $failed_checks -gt 0 ]]; then
        log_warn "Some checks failed. Failed attempts: $failed_attempts/$MAX_FAILED_ATTEMPTS"

        # A failing repair is logged rather than allowed to abort the script,
        # so this function always reaches its own `return 1`.
        if [[ $failed_attempts -ge $MAX_FAILED_ATTEMPTS ]]; then
            log_error "Max failed attempts reached. Running emergency repair..."
            emergency_repair || log_warn "Emergency repair did not complete successfully"
        else
            repair_backup_system || log_warn "Repair did not complete successfully"
        fi

        return 1
    else
        log_info "✅ All health checks passed!"
        reset_failed_attempts
        return 0
    fi
}

# Show usage
# Prints the help text to stdout; the script name shown is taken from $0.
usage() {
    local prog="$0"

    printf '%s\n' \
        "OpenClaw Backup Health Check & Auto-Repair" \
        "" \
        "Usage: ${prog} [OPTIONS]" \
        "" \
        "Options:" \
        "    -h, --help          Show this help message" \
        "    -c, --check         Run health check only (no repair)" \
        "    -r, --repair        Run repair only" \
        "    -e, --emergency     Run emergency repair" \
        "    -v, --verbose       Enable verbose output" \
        "" \
        "Examples:" \
        "    ${prog}                  Run full health check with auto-repair" \
        "    ${prog} --check          Run health check only" \
        "    ${prog} --repair         Run repair only" \
        "    ${prog} --emergency      Run emergency repair"
}

# Main
# Dispatches on the first command-line argument and exits with the status of
# the selected action. Anything other than the help/check/repair/emergency
# flags runs the full health check; -v/--verbose additionally exports
# OPENCLAW_DEBUG=true first.
main() {
    # Create log directory
    mkdir -p "$BACKUP_LOG_DIR"

    local mode="${1:-}"

    if [[ "$mode" == "-h" || "$mode" == "--help" ]]; then
        usage
        exit 0
    elif [[ "$mode" == "-c" || "$mode" == "--check" ]]; then
        # Checks only: stop at the first failing check, never repair.
        load_env
        check_commands && check_env_vars && check_hf_connectivity && \
            check_dataset_access && check_backup_scripts && \
            check_recent_backups && check_disk_space
        exit $?
    elif [[ "$mode" == "-r" || "$mode" == "--repair" ]]; then
        load_env
        repair_backup_system
        exit $?
    elif [[ "$mode" == "-e" || "$mode" == "--emergency" ]]; then
        load_env
        emergency_repair
        exit $?
    fi

    # Default path: full health check, optionally with debug logging.
    if [[ "$mode" == "-v" || "$mode" == "--verbose" ]]; then
        export OPENCLAW_DEBUG=true
    fi
    run_health_check
    exit $?
}

# Script entry point: pass all command-line arguments through to main, which
# ends the script via exit.
main "$@"