#!/bin/bash
# SSH service watchdog - keeps the SSH service continuously available.
# Function: monitors sshd, restores it automatically when it fails, and keeps
#           detailed logs.
# Version:  4.0 - enhanced: detailed error logging, configuration integrity
#           check, restart rate limiting, exponential backoff.
#
# NOTE(review): this script was restored from a copy whose line breaks had been
# collapsed and whose here-documents had been stripped. Every region that had
# to be rebuilt is marked with NOTE(review) and must be confirmed against the
# original before deployment.

# Configuration (can be overridden through environment variables)
SSH_PORT=${SSH_PORT:-22}
CHECK_INTERVAL=${CHECK_INTERVAL:-30}
MAX_RETRIES=${MAX_RETRIES:-3}
LOG_FILE="/var/log/ssh_watchdog.log"
SSH_ERROR_LOG="/var/log/ssh_error.log"
MAX_LOG_SIZE=10485760  # 10MB
SSH_CONFIG="/etc/ssh/sshd_config"
SSH_BACKUP_CONFIG="/etc/ssh/sshd_config.backup"
SSH_CONFIG_CHECKSUM="/etc/ssh/sshd_config.checksum"
NOTIFICATION_ENABLED=${NOTIFICATION_ENABLED:-false}
NOTIFICATION_WEBHOOK=${NOTIFICATION_WEBHOOK:-}
MONITOR_API_URL=${MONITOR_API_URL:-"http://localhost:7680/api/terminal"}

# Restart rate limiting
MAX_RESTARTS_PER_HOUR=${MAX_RESTARTS_PER_HOUR:-10}
restart_count=0
last_restart_hour=$(date +%Y%m%d%H)

# Exponential backoff configuration
BACKOFF_BASE=${BACKOFF_BASE:-2}
BACKOFF_MAX=${BACKOFF_MAX:-300}  # maximum backoff time (seconds)
current_backoff=1

# Keep a copy of the real stdout on fd 3. Several functions hand a file path
# back through stdout and are called as x=$(func); log_message writes its
# console copy to fd 3 so log lines never end up inside those captured values.
exec 3>&1

# Append a message to the SSH error log and mirror it to the main log.
# Arguments: $1 - level (INFO/WARN/ERROR/DEBUG), $2 - message text
log_ssh_error() {
    local level=$1
    local message=$2
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    local error_file="$SSH_ERROR_LOG"

    # Make sure the error log directory and file exist
    mkdir -p "$(dirname "$error_file")"
    touch "$error_file"

    # Write to the error log
    printf '[%s] [%s] %s\n' "$timestamp" "$level" "$message" >> "$error_file"

    # Also record in the main log
    log_message "$level" "$message"
}

# Color definitions (used for console log output)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'  # No Color

# Make sure the log directory exists
mkdir -p "$(dirname "$LOG_FILE")"
touch "$LOG_FILE"

# ========== Shared helpers ==========

# Print the path of the first executable sshd binary found.
# Returns: 0 and the path on stdout, or 1 (no output) when none exists.
find_sshd_binary() {
    local candidate
    for candidate in /usr/sbin/sshd /usr/bin/sshd; do
        if [ -x "$candidate" ]; then
            printf '%s\n' "$candidate"
            return 0
        fi
    done
    return 1
}

# Escape a string so it can be embedded inside a JSON string literal.
# Arguments: $1 - raw text. Outputs the escaped text on stdout.
json_escape() {
    local text=$1
    text=${text//\\/\\\\}
    text=${text//\"/\\\"}
    text=${text//$'\n'/\\n}
    text=${text//$'\r'/\\r}
    text=${text//$'\t'/\\t}
    printf '%s' "$text"
}

# ========== Configuration integrity check ==========

# Print the SHA-256 checksum of the sshd configuration file (empty on error).
calculate_config_checksum() {
    sha256sum "$SSH_CONFIG" 2>/dev/null | awk '{print $1}'
}

# Store the current configuration checksum in $SSH_CONFIG_CHECKSUM.
save_config_checksum() {
    local checksum
    checksum=$(calculate_config_checksum)
    if [ -n "$checksum" ]; then
        echo "$checksum" > "$SSH_CONFIG_CHECKSUM"
        log_message "DEBUG" "配置文件校验和已保存: $checksum"
    fi
}

# Compare the configuration file against the stored checksum.
# Returns: 0 when they match (or no checksum was stored yet, in which case one
#          is saved), 1 on mismatch.
verify_config_integrity() {
    if [ ! -f "$SSH_CONFIG_CHECKSUM" ]; then
        log_message "WARN" "配置文件校验和不存在,跳过完整性检查"
        save_config_checksum
        return 0
    fi

    local current_checksum saved_checksum
    current_checksum=$(calculate_config_checksum)
    saved_checksum=$(cat "$SSH_CONFIG_CHECKSUM" 2>/dev/null)

    if [ "$current_checksum" != "$saved_checksum" ]; then
        log_message "ERROR" "配置文件可能被篡改!校验和不匹配"
        log_message "ERROR" "期望: $saved_checksum"
        log_message "ERROR" "当前: $current_checksum"
        return 1
    fi

    return 0
}

# ========== Restart rate limiting ==========

# Count one restart attempt against the hourly budget.
# Globals: restart_count, last_restart_hour (read and written)
# Returns: 0 when the restart is allowed, 1 when the limit was reached.
check_restart_rate_limit() {
    local current_hour
    current_hour=$(date +%Y%m%d%H)

    # Reset the counter when a new clock hour starts
    if [ "$current_hour" != "$last_restart_hour" ]; then
        restart_count=0
        last_restart_hour=$current_hour
    fi

    if [ "$restart_count" -ge "$MAX_RESTARTS_PER_HOUR" ]; then
        log_message "ERROR" "重启频率超限!过去1小时内已重启 $restart_count 次(最大 $MAX_RESTARTS_PER_HOUR 次)"
        return 1
    fi

    restart_count=$((restart_count + 1))
    log_message "DEBUG" "重启计数: $restart_count/$MAX_RESTARTS_PER_HOUR (小时: $current_hour)"
    return 0
}

# ========== Exponential backoff ==========

# Print BACKOFF_BASE ** current_backoff, capped at BACKOFF_MAX.
# The power is built by repeated multiplication that stops at the cap: a
# direct "**" wraps around 64-bit arithmetic once current_backoff grows large
# and would produce a negative or zero sleep time.
calculate_backoff() {
    local backoff=1
    local step=0

    while [ "$step" -lt "$current_backoff" ] && [ "$backoff" -lt "$BACKOFF_MAX" ]; do
        backoff=$((backoff * BACKOFF_BASE))
        step=$((step + 1))
    done

    if [ "$backoff" -gt "$BACKOFF_MAX" ]; then
        backoff=$BACKOFF_MAX
    fi

    echo "$backoff"
}

# Sleep for the current backoff time, then raise the backoff level.
apply_backoff() {
    local backoff_time
    backoff_time=$(calculate_backoff)
    log_message "WARN" "应用指数退避:等待 ${backoff_time} 秒后重试..."
    sleep "$backoff_time"
    current_backoff=$((current_backoff + 1))
}

# Reset the backoff level (called after a success).
reset_backoff() {
    if [ "$current_backoff" -gt 1 ]; then
        log_message "INFO" "恢复成功,重置退避计数器"
    fi
    current_backoff=1
}

# ========== Detailed SSH error capture ==========

# Capture sshd's own diagnostics (config test, short debug run, effective
# config) into a report file under /tmp.
# Outputs: the report file path on stdout.
capture_sshd_errors() {
    local error_output_file
    error_output_file="/tmp/sshd_error_output_$(date +%Y%m%d_%H%M%S).log"

    log_ssh_error "INFO" "正在捕获SSH守护进程的详细错误信息..."

    # The group redirect is the only writer of the report file; piping the
    # commands through "tee -a" as well would store every line twice.
    {
        echo "=== SSH守护进程错误捕获 ==="
        echo "捕获时间: $(date '+%Y-%m-%d %H:%M:%S')"
        echo ""

        echo "=== 1. SSH配置测试结果 ==="
        sshd -t 2>&1
        echo ""

        echo "=== 2. SSH守护进程调试输出(前10行)==="
        # Run sshd in debug mode for a few seconds only
        timeout 3 sshd -d -D 2>&1 | head -10
        echo ""

        echo "=== 3. SSH配置文件的语法检查 ==="
        sshd -T 2>&1 | head -50
        echo ""
    } >> "$error_output_file" 2>&1

    log_ssh_error "INFO" "SSH错误详情已保存到: $error_output_file"
    echo "$error_output_file"
}

# Collect SSH-related system logs and state into a report file under /tmp.
# Outputs: the report file path on stdout.
collect_ssh_system_logs() {
    local log_file
    log_file="/tmp/ssh_system_logs_$(date +%Y%m%d_%H%M%S).log"

    log_ssh_error "INFO" "正在收集SSH相关的系统日志..."

    {
        echo "=== SSH系统日志收集 ==="
        echo "收集时间: $(date '+%Y-%m-%d %H:%M:%S')"
        echo ""

        echo "=== 1. 系统日志中的SSH相关记录(最近50行)==="
        if command -v journalctl &> /dev/null; then
            journalctl -u ssh --no-pager -n 50 2>&1 || echo "无法获取journalctl日志"
        elif [ -f "/var/log/auth.log" ]; then
            tail -50 /var/log/auth.log 2>&1 | grep -i ssh || echo "无SSH相关日志"
        elif [ -f "/var/log/secure" ]; then
            tail -50 /var/log/secure 2>&1 | grep -i ssh || echo "无SSH相关日志"
        else
            echo "未找到SSH日志文件"
        fi
        echo ""

        echo "=== 2. SSH进程当前状态 ==="
        ps aux | grep sshd | grep -v grep || echo "无SSH进程运行"
        echo ""

        echo "=== 3. 网络连接状态(SSH端口)==="
        if command -v netstat &> /dev/null; then
            netstat -tulnp 2>&1 | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 无监听"
        elif command -v ss &> /dev/null; then
            ss -tulnp 2>&1 | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 无监听"
        fi
        echo ""

        echo "=== 4. 最近的系统消息(dmesg)==="
        if command -v dmesg &> /dev/null; then
            dmesg | tail -50 2>&1 | grep -i "ssh\|network\|connection" || echo "无相关内核日志"
        fi
        echo ""

        echo "=== 5. SSH配置文件权限检查 ==="
        ls -la /etc/ssh/ 2>&1 || echo "无法检查/etc/ssh权限"
        echo ""

        echo "=== 6. 文件系统状态 ==="
        df -h /etc/ssh 2>&1 || echo "无法检查文件系统"
        echo ""
    } >> "$log_file" 2>&1

    log_ssh_error "INFO" "SSH系统日志已保存到: $log_file"
    echo "$log_file"
}

# Print the tail of the SSH error log plus error lines from the system log.
# Arguments: $1 - number of lines to show (default 50)
show_recent_ssh_errors() {
    local lines=${1:-50}

    echo "=== 最近的SSH错误日志(最后 $lines 行)==="
    echo ""

    if [ -f "$SSH_ERROR_LOG" ]; then
        tail -n "$lines" "$SSH_ERROR_LOG"
    else
        echo "SSH错误日志文件不存在: $SSH_ERROR_LOG"
    fi

    echo ""
    echo "=== 最近的系统日志中的SSH错误 ==="
    if command -v journalctl &> /dev/null; then
        journalctl -u ssh --no-pager -n "$lines" 2>&1 | grep -i "error\|fail\|denied" || echo "无错误日志"
    elif [ -f "/var/log/auth.log" ]; then
        tail -n "$lines" /var/log/auth.log 2>&1 | grep -i "error\|fail\|denied" || echo "无错误日志"
    fi
}

# Run a detailed sshd configuration test, store the report under /tmp and log
# its content as errors (callers invoke this after "sshd -t" already failed).
# Outputs: the report file path on stdout.
test_ssh_config_detailed() {
    local error_file
    error_file="/tmp/ssh_config_test_$(date +%Y%m%d_%H%M%S).log"

    log_ssh_error "INFO" "正在详细测试SSH配置文件..."

    {
        echo "=== SSH配置详细测试 ==="
        echo "测试时间: $(date '+%Y-%m-%d %H:%M:%S')"
        echo ""

        echo "=== 1. 配置语法检查 ==="
        sshd -t 2>&1
        local syntax_result=$?
        echo "语法检查结果: $syntax_result"
        echo ""

        if [ "$syntax_result" -ne 0 ]; then
            echo "=== 2. 配置详细信息(sshd -T)==="
            sshd -T 2>&1 | head -100
            echo ""

            echo "=== 3. 配置文件内容(敏感信息已隐藏)==="
            if [ -f "$SSH_CONFIG" ]; then
                grep -v "^#" "$SSH_CONFIG" | grep -v "^$" \
                    | sed 's/PasswordAuthentication.*/PasswordAuthentication [HIDDEN]/g' \
                    | head -50
            fi
            echo ""
        fi
    } > "$error_file" 2>&1

    local errors
    errors=$(cat "$error_file")
    log_ssh_error "ERROR" "SSH配置测试发现错误:"
    log_ssh_error "ERROR" "$errors"

    echo "$error_file"
}

# ========== Binary integrity check ==========

# Check that an sshd binary exists, has all shared libraries, and accepts the
# current configuration.
# Returns: 0 when all checks pass, 1 otherwise.
check_sshd_binary() {
    local sshd_path
    if ! sshd_path=$(find_sshd_binary); then
        log_message "ERROR" "找不到sshd可执行文件"
        return 1
    fi

    # Check shared library dependencies (using ldd)
    if command -v ldd &> /dev/null; then
        local missing_libs
        missing_libs=$(ldd "$sshd_path" 2>&1 | grep "not found" || true)
        if [ -n "$missing_libs" ]; then
            log_message "ERROR" "sshd依赖库缺失:"
            local line
            while read -r line; do
                log_message "ERROR" "  $line"
            done <<< "$missing_libs"
            return 1
        fi
    fi

    # Try running sshd -t (configuration test)
    if ! "$sshd_path" -t 2>&1; then
        log_message "ERROR" "sshd配置测试失败"
        return 1
    fi

    log_message "DEBUG" "sshd二进制文件检查通过: $sshd_path"
    return 0
}

# ========== Detailed diagnostics ==========

# Collect diagnostics about why sshd cannot start into a report under /tmp.
# Outputs: the report file path on stdout.
collect_startup_diagnostics() {
    local diag_file
    diag_file="/tmp/ssh_startup_diagnostics_$(date +%Y%m%d_%H%M%S).log"

    log_message "INFO" "收集启动诊断信息: $diag_file"

    {
        echo "=== SSH启动诊断报告 ==="
        echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
        echo ""

        echo "=== SSH二进制文件 ==="
        command -v sshd 2>/dev/null || echo "sshd未找到在PATH中"
        ls -la /usr/sbin/sshd /usr/bin/sshd 2>/dev/null || echo "sshd二进制文件不存在"
        file /usr/sbin/sshd /usr/bin/sshd 2>/dev/null || echo "无法检查文件类型"
        echo ""

        echo "=== 依赖库检查 ==="
        if command -v ldd &> /dev/null; then
            ldd /usr/sbin/sshd 2>/dev/null || ldd /usr/bin/sshd 2>/dev/null || echo "无法检查依赖库"
        fi
        echo ""

        echo "=== 配置文件检查 ==="
        if [ -f "$SSH_CONFIG" ]; then
            echo "配置文件存在: $SSH_CONFIG"
            echo "配置文件权限: $(ls -la "$SSH_CONFIG")"
            echo "配置文件大小: $(stat -c%s "$SSH_CONFIG" 2>/dev/null || echo '未知') 字节"
            echo ""
            echo "=== 配置内容(敏感信息已隐藏)==="
            grep -v "^#" "$SSH_CONFIG" | grep -v "^$" \
                | sed 's/PasswordAuthentication.*/PasswordAuthentication [HIDDEN]/g' \
                | head -50
        else
            echo "配置文件不存在: $SSH_CONFIG"
        fi
        echo ""

        echo "=== 端口占用检查 ==="
        if command -v netstat &> /dev/null; then
            netstat -tulnp 2>/dev/null | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 未被占用"
        elif command -v ss &> /dev/null; then
            ss -tulnp 2>/dev/null | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 未被占用"
        fi
        echo ""

        echo "=== 最近的错误日志 ==="
        if [ -f "/var/log/auth.log" ]; then
            tail -50 /var/log/auth.log 2>/dev/null | grep -i ssh || echo "无SSH相关日志"
        elif [ -f "/var/log/secure" ]; then
            tail -50 /var/log/secure 2>/dev/null | grep -i ssh || echo "无SSH相关日志"
        elif command -v journalctl &> /dev/null; then
            journalctl -u ssh --no-pager -n 50 2>/dev/null || echo "无法获取journalctl日志"
        fi
        echo ""

        echo "=== 文件系统检查 ==="
        df -h /etc/ssh 2>/dev/null || echo "无法检查文件系统"
        echo ""

        echo "=== 权限检查 ==="
        ls -la /etc/ssh/ 2>/dev/null || echo "无法检查/etc/ssh权限"
        ls -la /var/run/ 2>/dev/null | grep ssh || echo "无法检查/var/run/ssh"
        echo ""
    } > "$diag_file" 2>&1

    log_message "INFO" "启动诊断信息已保存: $diag_file"
    echo "$diag_file"
}

# Write one log record to $LOG_FILE and echo it to the console.
# Arguments: $1 - level (INFO/WARN/ERROR/DEBUG), $2 - message text
# Outputs:   ERROR records go to stderr; all other records go to fd 3 (the
#            script's original stdout), never to the caller's current stdout,
#            so the record is not swallowed by a surrounding $(...).
log_message() {
    local level=$1
    local message=$2
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # Log rotation: when the log exceeds the maximum size, move it aside and
    # start a new file
    if [ -f "$LOG_FILE" ]; then
        local file_size
        file_size=$(stat -c%s "$LOG_FILE" 2>/dev/null || echo "0")
        if [ "$file_size" -gt "$MAX_LOG_SIZE" ]; then
            mv "$LOG_FILE" "${LOG_FILE}.$(date +%Y%m%d_%H%M%S)" 2>/dev/null
            touch "$LOG_FILE"
        fi
    fi

    # Write to the log file
    printf "[%s] [%s] %s\n" "$timestamp" "$level" "$message" >> "$LOG_FILE"

    # Also print to the console, in color when the console is a terminal
    if [ -t 3 ]; then
        case $level in
            "INFO")
                printf "${GREEN}[%s] [INFO] %s${NC}\n" "$timestamp" "$message" >&3
                ;;
            "WARN")
                printf "${YELLOW}[%s] [WARN] %s${NC}\n" "$timestamp" "$message" >&3
                ;;
            "ERROR")
                printf "${RED}[%s] [ERROR] %s${NC}\n" "$timestamp" "$message" >&2
                ;;
            "DEBUG")
                printf "${BLUE}[%s] [DEBUG] %s${NC}\n" "$timestamp" "$message" >&3
                ;;
            *)
                printf "[%s] [%s] %s\n" "$timestamp" "$level" "$message" >&3
                ;;
        esac
    else
        # Non-terminal environment (e.g. Docker logs): no colors
        if [ "$level" = "ERROR" ]; then
            printf "[%s] [%s] %s\n" "$timestamp" "$level" "$message" >&2
        else
            printf "[%s] [%s] %s\n" "$timestamp" "$level" "$message" >&3
        fi
    fi
}

# Send a notification when notifications are enabled.
# Arguments: $1 - subject, $2 - message text
send_notification() {
    local subject=$1
    local message=$2

    if [ "$NOTIFICATION_ENABLED" != "true" ]; then
        return 0
    fi

    log_message "INFO" "发送通知: $subject"

    # Webhook notification (WeCom, DingTalk and similar)
    if [ -n "$NOTIFICATION_WEBHOOK" ]; then
        # NOTE(review): the original payload here-document and curl options
        # were lost; only the ">> $LOG_FILE 2>&1" tail of the curl call
        # survived. The text-message payload and the --max-time guard below
        # are a reconstruction - confirm the payload schema for the webhook
        # in use.
        local payload
        payload=$(printf '{"msgtype": "text", "text": {"content": "%s: %s"}}' \
            "$(json_escape "$subject")" "$(json_escape "$message")")
        curl -s --max-time 10 -X POST \
            -H "Content-Type: application/json" \
            -d "$payload" \
            "$NOTIFICATION_WEBHOOK" >> "$LOG_FILE" 2>&1
    fi

    # Extensible: mail notification, SMS notification, ...
}

# Collect a general diagnostics report for the SSH service under /tmp.
# Outputs: the report file path on stdout.
collect_diagnostics() {
    local diag_file
    diag_file="/tmp/ssh_diagnostics_$(date +%Y%m%d_%H%M%S).log"

    log_message "INFO" "收集诊断信息到: $diag_file"

    {
        echo "=== SSH服务诊断报告 ==="
        echo "收集时间: $(date '+%Y-%m-%d %H:%M:%S')"
        echo ""

        echo "=== 系统信息 ==="
        uname -a
        echo ""

        echo "=== SSH进程状态 ==="
        ps aux | grep sshd | grep -v grep || echo "无SSH进程运行"
        echo ""

        echo "=== 网络连接状态 ==="
        # Fall back to ss on hosts that do not ship netstat
        if command -v netstat &> /dev/null; then
            netstat -tulnp 2>/dev/null | grep ":$SSH_PORT" || echo "端口 $SSH_PORT 无监听"
        else
            ss -tulnp 2>/dev/null | grep ":$SSH_PORT" || echo "端口 $SSH_PORT 无监听"
        fi
        echo ""

        echo "=== SSH配置检查 ==="
        if [ -f "$SSH_CONFIG" ]; then
            sshd -t 2>&1 || echo "SSH配置文件有错误"
        else
            echo "SSH配置文件不存在: $SSH_CONFIG"
        fi
        echo ""

        echo "=== 系统资源状态 ==="
        echo "内存使用:"
        free -h
        echo ""
        echo "磁盘使用:"
        df -h
        echo ""
        echo "系统负载:"
        uptime
        echo ""

        echo "=== 最近的系统日志(SSH相关)==="
        if command -v journalctl &> /dev/null; then
            journalctl -u ssh --no-pager -n 50 2>/dev/null || echo "无法获取journalctl日志"
        else
            tail -100 /var/log/auth.log 2>/dev/null \
                || tail -100 /var/log/secure 2>/dev/null \
                || echo "无法获取SSH日志"
        fi
        echo ""

        echo "=== 防火墙状态 ==="
        if command -v iptables &> /dev/null; then
            iptables -L -n 2>/dev/null | head -50
        fi
        if command -v ufw &> /dev/null; then
            ufw status 2>/dev/null
        fi
        echo ""
    } > "$diag_file" 2>&1

    log_message "INFO" "诊断信息已保存到: $diag_file"
    echo "$diag_file"  # hand the report path back to the caller
}

# Check whether an sshd process exists.
# Returns: 0 when a process named exactly "sshd" is running, 1 otherwise.
check_ssh_process() {
    pgrep -x "sshd" > /dev/null
}

# Check whether something is listening on the SSH port.
# Uses netstat, then ss; when neither tool is installed it falls back to a TCP
# connection probe so that a missing tool is not mistaken for a dead service.
# Returns: 0 when the port is listening, 1 otherwise.
check_ssh_port() {
    local listing

    if command -v netstat &> /dev/null; then
        listing=$(netstat -tuln 2>/dev/null)
    elif command -v ss &> /dev/null; then
        listing=$(ss -tuln 2>/dev/null)
    else
        check_ssh_response
        return $?
    fi

    grep -q ":$SSH_PORT " <<< "$listing"
}

# Check whether the SSH port accepts a TCP connection.
# Returns: 0 when the connection succeeds within 5 seconds, non-zero otherwise.
check_ssh_response() {
    # Simple connection test using timeout and bash's /dev/tcp.
    # NOTE(review): the probe target was lost in the damaged source; the host
    # "localhost" is an assumption - confirm.
    timeout 5 bash -c "</dev/tcp/localhost/$SSH_PORT" 2>/dev/null
    return $?
}

# Check system resources (memory, root filesystem usage).
# Returns: 1 when memory or disk usage is above 90%, 0 otherwise. Values that
#          cannot be read count as 0 so a missing tool never raises an error.
check_system_resources() {
    local memory_usage disk_usage

    memory_usage=$(free 2>/dev/null | awk '/Mem/ { if ($2 > 0) printf "%d", ($3 / $2) * 100; exit }')
    # -P keeps each filesystem on one line even for long device names
    disk_usage=$(df -P / 2>/dev/null | awk 'NR == 2 { gsub(/%/, "", $5); print $5 }')

    case $memory_usage in
        ''|*[!0-9]*) memory_usage=0 ;;
    esac
    case $disk_usage in
        ''|*[!0-9]*) disk_usage=0 ;;
    esac

    if [ "$memory_usage" -gt 90 ]; then
        log_message "WARN" "内存使用率过高: ${memory_usage}%"
        return 1
    fi

    if [ "$disk_usage" -gt 90 ]; then
        log_message "WARN" "磁盘使用率过高: ${disk_usage}%"
        return 1
    fi

    return 0
}

# Log sshd's captured stderr and gather debug reports after a failed start.
# Arguments: $1 - file holding the stderr output of the failed sshd start
report_sshd_start_failure() {
    local sshd_output=$1
    local line

    while read -r line; do
        log_ssh_error "ERROR" "  $line"
    done < "$sshd_output"

    log_ssh_error "ERROR" "SSH守护进程调试信息:"
    local sshd_debug
    sshd_debug=$(capture_sshd_errors)
    log_ssh_error "ERROR" "调试信息已保存到: $sshd_debug"

    log_ssh_error "ERROR" "系统日志信息:"
    local system_logs
    system_logs=$(collect_ssh_system_logs)
    log_ssh_error "ERROR" "系统日志已保存到: $system_logs"
}

# Try to repair the SSH service: stop it, validate binary and configuration,
# restore or regenerate the configuration when needed, then start sshd again.
# Returns: 0 when sshd is running and listening afterwards, 1 otherwise.
repair_ssh_service() {
    log_message "INFO" "开始尝试修复SSH服务..."

    # 0. Check the restart rate limit
    if ! check_restart_rate_limit; then
        send_notification "SSH重启频率超限" "过去1小时内已尝试重启 $MAX_RESTARTS_PER_HOUR 次,暂停自动重启"
        return 1
    fi

    # 1. Collect diagnostics before repairing
    local diag_file
    diag_file=$(collect_diagnostics)
    send_notification "SSH服务异常" "SSH服务检测到异常,开始尝试修复。诊断信息: $diag_file"

    # 2. Stop the existing SSH service (if any)
    log_message "INFO" "停止现有的SSH服务..."
    killall sshd 2>/dev/null
    sleep 2

    # 3. Check the sshd binary
    log_message "INFO" "检查SSH二进制文件完整性..."
    if ! check_sshd_binary; then
        log_message "ERROR" "SSH二进制文件检查失败,尝试收集更多信息..."
        local startup_diag
        startup_diag=$(collect_startup_diagnostics)
        send_notification "SSH二进制文件损坏" "SSH二进制文件检查失败,诊断信息: $startup_diag"
    fi

    # 4. Verify configuration integrity
    log_message "INFO" "验证SSH配置完整性..."
    if ! verify_config_integrity; then
        log_ssh_error "WARN" "配置完整性验证失败,使用备份配置"
        if [ -f "$SSH_BACKUP_CONFIG" ]; then
            cp "$SSH_BACKUP_CONFIG" "$SSH_CONFIG"
            log_message "INFO" "已恢复备份配置"
        fi
    fi

    # 5. Check the SSH configuration (with detailed error output)
    log_message "INFO" "检查SSH配置文件..."
    if ! sshd -t 2>> "$SSH_ERROR_LOG"; then
        log_ssh_error "ERROR" "SSH配置文件有错误,详细信息:"
        local config_error_file
        config_error_file=$(test_ssh_config_detailed)
        log_ssh_error "ERROR" "配置文件测试详情已保存到: $config_error_file"

        # Repair: use the backup configuration or generate a minimal one
        if [ -f "$SSH_BACKUP_CONFIG" ]; then
            log_message "INFO" "使用备份配置文件: $SSH_BACKUP_CONFIG"
            cp "$SSH_BACKUP_CONFIG" "$SSH_CONFIG"
        else
            log_message "WARN" "备份配置不存在,创建最小配置"
            create_minimal_ssh_config
        fi

        # Check the configuration again
        if ! sshd -t 2>> "$SSH_ERROR_LOG"; then
            log_ssh_error "ERROR" "配置修复失败,使用默认配置"
            create_default_ssh_config
        fi
    fi

    # 6. Back up the current configuration (when it is valid)
    if sshd -t 2>/dev/null; then
        cp "$SSH_CONFIG" "$SSH_BACKUP_CONFIG" 2>/dev/null
        save_config_checksum
        log_message "INFO" "SSH配置已备份到: $SSH_BACKUP_CONFIG"
    fi

    # 7. Start the SSH service (with detailed error output)
    log_message "INFO" "启动SSH服务..."
    local sshd_path
    if ! sshd_path=$(find_sshd_binary); then
        log_ssh_error "ERROR" "找不到sshd可执行文件"
        local startup_diag
        startup_diag=$(collect_startup_diagnostics)
        log_ssh_error "ERROR" "启动诊断信息: $startup_diag"
        send_notification "SSH修复失败" "找不到sshd可执行文件。诊断: $startup_diag"
        return 1
    fi

    # Try to start sshd, capturing its error output
    log_message "INFO" "尝试启动SSH服务: $sshd_path"
    local sshd_output
    sshd_output=$(mktemp)
    if ! "$sshd_path" 2> "$sshd_output"; then
        log_ssh_error "ERROR" "SSH服务启动失败,详细错误信息:"
        report_sshd_start_failure "$sshd_output"

        send_notification "SSH启动失败" "SSH服务启动失败,详细错误信息已记录到: $SSH_ERROR_LOG"
        rm -f "$sshd_output"
        return 1
    fi
    rm -f "$sshd_output"

    sleep 3

    # 8. Verify that the start succeeded
    if check_ssh_process && check_ssh_port; then
        log_message "INFO" "SSH服务修复成功"
        reset_backoff

        # Show the most recent SSH error log entries (if any)
        if [ -f "$SSH_ERROR_LOG" ]; then
            log_message "INFO" "最近的SSH错误日志:"
            show_recent_ssh_errors 10
        fi

        send_notification "SSH服务恢复" "SSH服务已成功修复并重新启动"

        # Report the state to the monitoring API
        report_to_monitor_api "ssh_recovered" "SSH service has been successfully recovered"
        return 0
    else
        log_message "ERROR" "SSH服务修复失败,应用指数退避"
        apply_backoff
        send_notification "SSH修复失败" "SSH服务修复失败,已应用退避策略等待重试。诊断信息: $diag_file"

        # Report the state to the monitoring API
        report_to_monitor_api "ssh_repair_failed" "SSH service repair failed, applying backoff"
        return 1
    fi
}

# Create a minimal SSH configuration in $SSH_CONFIG.
create_minimal_ssh_config() {
    log_message "INFO" "创建最小SSH配置..."
    # NOTE(review): the original here-document content was lost. Only the
    # port is written here, so every other setting (including the
    # authentication policy) falls back to sshd's built-in defaults. Restore
    # the original directives before relying on this in production.
    cat > "$SSH_CONFIG" <<EOF
# Minimal sshd configuration generated by the SSH watchdog
Port $SSH_PORT
EOF
}

# Create a default SSH configuration in $SSH_CONFIG.
# NOTE(review): the whole original body of this function was lost; only its
# name is known from its call sites. This reconstruction copies the packaged
# sample configuration when one exists at the path below and otherwise falls
# back to create_minimal_ssh_config - confirm against the original.
create_default_ssh_config() {
    if [ -f "/usr/share/openssh/sshd_config" ]; then
        cp "/usr/share/openssh/sshd_config" "$SSH_CONFIG"
    else
        create_minimal_ssh_config
    fi
}

# Report a status event to the monitoring API.
# Arguments: $1 - status key (e.g. "ssh_recovered"), $2 - message text
# NOTE(review): the original body was lost apart from the trailing
# '>> "$LOG_FILE" 2>&1'. The JSON field names and the --max-time guard are a
# reconstruction - confirm the schema expected by $MONITOR_API_URL.
report_to_monitor_api() {
    local status=$1
    local message=$2
    local payload

    payload=$(printf '{"source": "ssh_watchdog", "status": "%s", "message": "%s", "timestamp": "%s"}' \
        "$(json_escape "$status")" "$(json_escape "$message")" "$(date '+%Y-%m-%d %H:%M:%S')")

    curl -s --max-time 5 -X POST \
        -H "Content-Type: application/json" \
        -d "$payload" \
        "$MONITOR_API_URL" >> "$LOG_FILE" 2>&1
}

# Main monitoring loop: checks resources, process, port and response on every
# cycle and triggers a repair after MAX_RETRIES consecutive failures.
main_loop() {
    log_message "INFO" "SSH服务看门狗启动,检查间隔: ${CHECK_INTERVAL}秒"
    log_message "INFO" "配置: SSH端口=$SSH_PORT, 最大重试次数=$MAX_RETRIES, 通知=$([ "$NOTIFICATION_ENABLED" = "true" ] && echo "启用" || echo "禁用")"
    log_message "INFO" "重启频率限制: $MAX_RESTARTS_PER_HOUR/小时, 指数退避: ${BACKOFF_BASE}s 基础, ${BACKOFF_MAX}s 最大"

    local consecutive_failures=0
    local total_checks=0
    local last_notify_time=0
    local notify_interval=3600  # notification interval: 1 hour
    local current_time ssh_status

    # Initial check: make sure the SSH configuration has a backup and checksum
    if [ -f "$SSH_CONFIG" ] && sshd -t 2>/dev/null; then
        cp "$SSH_CONFIG" "$SSH_BACKUP_CONFIG" 2>/dev/null
        save_config_checksum
        log_message "INFO" "初始SSH配置备份和校验和已保存"
    else
        log_message "WARN" "初始SSH配置检查失败,将在首次修复时创建"
    fi

    while true; do
        total_checks=$((total_checks + 1))
        current_time=$(date +%s)
        ssh_status="unknown"

        # 1. Check system resources
        if ! check_system_resources; then
            log_message "WARN" "系统资源不足,可能影响SSH服务稳定性"
            ssh_status="resource_warning"
        fi

        # 2. Check the SSH process
        if ! check_ssh_process; then
            log_message "ERROR" "SSH进程不存在"
            consecutive_failures=$((consecutive_failures + 1))
            ssh_status="no_process"
        # 3. Check the SSH port
        elif ! check_ssh_port; then
            log_message "ERROR" "SSH端口 $SSH_PORT 未监听"
            consecutive_failures=$((consecutive_failures + 1))
            ssh_status="port_not_listening"
        # 4. Check the SSH response
        elif ! check_ssh_response; then
            log_message "WARN" "SSH端口监听但无响应"
            consecutive_failures=$((consecutive_failures + 1))
            ssh_status="no_response"
        else
            # The SSH service is fully healthy
            ssh_status="healthy"
            if [ "$consecutive_failures" -gt 0 ]; then
                log_message "INFO" "SSH服务恢复正常 (之前连续失败 $consecutive_failures 次)"
                send_notification "SSH服务恢复" "SSH服务已恢复正常 after $consecutive_failures 次失败"
                report_to_monitor_api "ssh_healthy" "SSH service is healthy after $consecutive_failures failures"
            fi
            consecutive_failures=0
            reset_backoff
        fi

        # 5. Try a repair once the consecutive failures reach the threshold
        if [ "$consecutive_failures" -ge "$MAX_RETRIES" ]; then
            log_message "ERROR" "SSH服务连续失败 $consecutive_failures 次,开始修复..."
            send_notification "SSH服务异常" "SSH服务连续失败 $consecutive_failures 次,开始自动修复"

            if repair_ssh_service; then
                consecutive_failures=0
                log_message "INFO" "SSH服务修复成功,重置失败计数器"
                send_notification "SSH修复成功" "SSH服务已成功修复并重新启动"
            else
                log_message "ERROR" "SSH服务修复失败,将在下次检查时重试"

                # Collect detailed diagnostics (only on the first failure or
                # when enough time has passed since the last report)
                if [ $((current_time - last_notify_time)) -gt "$notify_interval" ]; then
                    local diag_file
                    diag_file=$(collect_diagnostics)
                    send_notification "SSH修复失败" "SSH服务修复失败,诊断信息已保存到: $diag_file"
                    last_notify_time=$current_time
                fi

                # The failure counter is kept so the next cycle retries.
                # After many failures, escalate to the forced recovery.
                if [ "$consecutive_failures" -ge $((MAX_RETRIES * 3)) ]; then
                    log_message "ERROR" "SSH服务多次修复失败(已失败 $consecutive_failures 次),尝试强制恢复..."
                    force_recover_ssh
                fi
            fi
        fi

        # 6. Periodic status record (once every 10 check cycles)
        if [ $((total_checks % 10)) -eq 0 ]; then
            log_message "INFO" "SSH服务状态: $ssh_status, 已连续监控 $total_checks 个周期, 连续失败: $consecutive_failures, 退避: $(( current_backoff - 1 )) 级"
            report_to_monitor_api "ssh_status" "Status: $ssh_status, Checks: $total_checks, Failures: $consecutive_failures, Backoff: $(( current_backoff - 1 ))"
        fi

        # 7. Watchdog self health check
        if [ $((total_checks % 100)) -eq 0 ]; then
            log_message "DEBUG" "看门狗运行正常,已执行 $total_checks 次检查,重启计数: $restart_count"

            # Remove old diagnostic files (keep the 5 most recent)
            cleanup_old_diagnostics

            # Verify configuration integrity
            verify_config_integrity || log_message "WARN" "配置完整性验证失败"
        fi

        sleep "$CHECK_INTERVAL"
    done
}

# Forced recovery of the SSH service (more aggressive strategy): kills sshd,
# cleans runtime files, reconfigures/reinstalls the package, regenerates host
# keys, writes a minimal configuration and starts sshd.
# Returns: 0 when sshd is running and listening afterwards, 1 otherwise.
force_recover_ssh() {
    log_message "WARN" "执行强制恢复策略..."

    # 0. Check the restart rate
    if ! check_restart_rate_limit; then
        send_notification "SSH强制恢复失败" "重启频率超限,跳过强制恢复"
        return 1
    fi

    # Record a detailed forced-recovery log
    log_ssh_error "WARN" "开始强制恢复SSH服务..."

    # 1. Kill all sshd processes
    log_message "INFO" "强制终止所有SSH进程..."
    log_ssh_error "INFO" "强制终止所有SSH进程"
    killall -9 sshd 2>/dev/null || true
    sleep 5

    # 2. Remove possibly corrupted SSH sockets and temporary files
    log_message "INFO" "清理SSH相关临时文件..."
    log_ssh_error "INFO" "清理SSH相关临时文件"
    rm -f /tmp/ssh-* 2>/dev/null || true
    rm -f /var/run/sshd.pid 2>/dev/null || true

    # 3. Check and create the required directories
    log_message "INFO" "检查SSH必要目录..."
    log_ssh_error "INFO" "检查SSH必要目录和权限"
    mkdir -p /var/run/sshd 2>/dev/null || true
    mkdir -p /etc/ssh 2>/dev/null || true
    chmod 755 /var/run/sshd 2>/dev/null || true

    # 4. Try to reconfigure/reinstall the SSH service
    if command -v apt-get &> /dev/null; then
        log_message "INFO" "尝试重新配置SSH服务..."
        log_ssh_error "INFO" "尝试重新配置SSH服务 (dpkg-reconfigure)"
        dpkg-reconfigure -f noninteractive openssh-server 2>> "$SSH_ERROR_LOG" || true

        # When the configuration still fails, try a reinstall
        if ! sshd -t 2>/dev/null; then
            log_message "WARN" "尝试重新安装openssh-server..."
            log_ssh_error "WARN" "配置测试失败,尝试重新安装openssh-server"
            apt-get update -qq 2>> "$SSH_ERROR_LOG" || true
            DEBIAN_FRONTEND=noninteractive apt-get install -y --reinstall openssh-server 2>> "$SSH_ERROR_LOG" || true
        fi
    elif command -v yum &> /dev/null; then
        log_message "INFO" "尝试重新安装openssh-server (yum)..."
        log_ssh_error "INFO" "尝试重新安装openssh-server (yum)"
        yum reinstall -y openssh-server 2>> "$SSH_ERROR_LOG" || true
    fi

    # 5. Generate SSH host keys (when missing)
    log_message "INFO" "检查SSH主机密钥..."
    log_ssh_error "INFO" "检查并生成SSH主机密钥"
    if [ ! -f "/etc/ssh/ssh_host_rsa_key" ]; then
        log_message "INFO" "生成SSH RSA主机密钥..."
        ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' 2>> "$SSH_ERROR_LOG" || true
    fi
    if [ ! -f "/etc/ssh/ssh_host_ecdsa_key" ]; then
        log_message "INFO" "生成SSH ECDSA主机密钥..."
        ssh-keygen -t ecdsa -f /etc/ssh/ssh_host_ecdsa_key -N '' 2>> "$SSH_ERROR_LOG" || true
    fi
    if [ ! -f "/etc/ssh/ssh_host_ed25519_key" ]; then
        log_message "INFO" "生成SSH ED25519主机密钥..."
        ssh-keygen -t ed25519 -f /etc/ssh/ssh_host_ed25519_key -N '' 2>> "$SSH_ERROR_LOG" || true
    fi

    # 6. Use the most conservative configuration
    log_message "INFO" "使用最保守的SSH配置..."
    log_ssh_error "INFO" "使用最保守的SSH配置"
    create_minimal_ssh_config

    # 7. Validate the configuration (detailed output)
    log_ssh_error "INFO" "验证SSH配置(详细模式)"
    if ! sshd -t 2>> "$SSH_ERROR_LOG"; then
        log_ssh_error "ERROR" "SSH配置验证失败,详细信息:"
        local config_error_file
        config_error_file=$(test_ssh_config_detailed)
        log_ssh_error "ERROR" "配置测试详情已保存到: $config_error_file"

        log_message "ERROR" "SSH配置验证失败,使用默认配置"
        create_default_ssh_config
    fi

    # 8. Try to start the SSH service (with detailed error output)
    log_message "INFO" "强制启动SSH服务..."
    log_ssh_error "INFO" "尝试启动SSH服务"

    local sshd_path startup_diag system_logs
    if ! sshd_path=$(find_sshd_binary); then
        log_ssh_error "ERROR" "找不到sshd可执行文件,尝试重新安装..."
        startup_diag=$(collect_startup_diagnostics)
        log_ssh_error "ERROR" "启动诊断信息: $startup_diag"
        send_notification "SSH二进制文件丢失" "找不到sshd可执行文件。诊断: $startup_diag"

        # Try to install the package
        if command -v apt-get &> /dev/null; then
            log_ssh_error "INFO" "尝试安装openssh-server"
            DEBIAN_FRONTEND=noninteractive apt-get install -y openssh-server 2>> "$SSH_ERROR_LOG" || true
        fi

        # Look for the binary again
        if ! sshd_path=$(find_sshd_binary); then
            log_ssh_error "ERROR" "重新安装后仍然找不到sshd,无法强制恢复"
            system_logs=$(collect_ssh_system_logs)
            log_ssh_error "ERROR" "系统日志已保存到: $system_logs"
            return 1
        fi
    fi

    # Check the binary
    if ! check_sshd_binary; then
        log_ssh_error "ERROR" "sshd二进制文件检查失败"
        startup_diag=$(collect_startup_diagnostics)
        log_ssh_error "ERROR" "启动诊断信息: $startup_diag"
        send_notification "SSH二进制文件损坏" "sshd二进制文件检查失败。诊断: $startup_diag"
    fi

    # Try to start sshd, capturing its error output
    log_ssh_error "INFO" "执行命令: $sshd_path"
    local sshd_output
    sshd_output=$(mktemp)
    if ! "$sshd_path" 2> "$sshd_output"; then
        log_ssh_error "ERROR" "SSH服务启动失败,详细错误信息:"
        report_sshd_start_failure "$sshd_output"

        send_notification "SSH强制启动失败" "SSH服务强制启动失败,详细错误信息已记录到: $SSH_ERROR_LOG"
    else
        log_ssh_error "INFO" "SSH服务启动成功(无错误输出)"
    fi
    rm -f "$sshd_output"

    sleep 5

    # 9. Verify whether the recovery succeeded
    if check_ssh_process && check_ssh_port; then
        log_message "INFO" "强制恢复成功"
        log_ssh_error "INFO" "强制恢复成功,SSH服务已正常运行"
        reset_backoff

        # Show the most recent SSH error log entries (if any)
        if [ -f "$SSH_ERROR_LOG" ]; then
            log_message "INFO" "最近的SSH错误日志:"
            show_recent_ssh_errors 10
        fi

        send_notification "SSH强制恢复成功" "SSH服务已通过强制恢复策略成功恢复"

        # Store the checksum of the new configuration
        save_config_checksum
        return 0
    else
        log_message "ERROR" "强制恢复失败,应用指数退避"
        log_ssh_error "ERROR" "强制恢复失败,应用指数退避"
        apply_backoff

        startup_diag=$(collect_startup_diagnostics)
        log_ssh_error "ERROR" "启动诊断信息: $startup_diag"

        system_logs=$(collect_ssh_system_logs)
        log_ssh_error "ERROR" "系统日志已保存到: $system_logs"

        send_notification "SSH强制恢复失败" "SSH服务强制恢复失败,详细错误信息已记录到: $SSH_ERROR_LOG"
        return 1
    fi
}

# Remove old report files written by this script, keeping the 5 most recent
# files of each report family under /tmp.
cleanup_old_diagnostics() {
    log_message "DEBUG" "清理旧的诊断文件..."

    local prefix stale_files
    local removed=0
    # Every report family this script writes, not only ssh_diagnostics_*;
    # otherwise the other reports accumulate in /tmp without bound.
    for prefix in ssh_diagnostics ssh_startup_diagnostics ssh_system_logs \
                  ssh_config_test sshd_error_output; do
        # File names are generated by this script from a date stamp (no
        # whitespace), so listing them with ls -t is safe here.
        stale_files=$(ls -t /tmp/"${prefix}"_*.log 2>/dev/null | tail -n +6)
        if [ -n "$stale_files" ]; then
            echo "$stale_files" | xargs rm -f 2>/dev/null || true
            removed=1
        fi
    done

    if [ "$removed" -eq 1 ]; then
        log_message "DEBUG" "已清理旧的诊断文件"
    fi
}

# Signal handler: graceful exit.
cleanup() {
    log_message "INFO" "接收到终止信号,SSH服务看门狗开始清理..."

    # Report the watchdog stop to the monitoring API
    report_to_monitor_api "watchdog_stopping" "SSH watchdog is stopping"

    # Optional: make sure the SSH service is still running before exiting
    if ! check_ssh_process; then
        log_message "WARN" "退出前检测到SSH服务未运行,尝试最后一次启动..."
        local sshd_path
        if sshd_path=$(find_sshd_binary); then
            "$sshd_path" 2>> "$LOG_FILE"
        fi
    fi

    log_message "INFO" "SSH服务看门狗已退出"
    exit 0
}

# Show help information.
# NOTE(review): the original usage text was lost. Only the two options that
# the startup log messages mention (--help, --show-errors) are listed.
show_usage() {
    cat <<EOF
Usage: $0 [--help | --show-errors [LINES]]

  --help                 Show this help text and exit
  --show-errors [LINES]  Show the most recent SSH error log entries
                         (default: 50 lines) and exit

Without options the watchdog starts its monitoring loop.
EOF
}

# NOTE(review): the original argument parsing, signal trap and the check that
# ended in "... >&2; exit 1" were lost. The section below is a reconstruction:
# the two options come from the startup log messages, the trap from the
# "signal handler" role of cleanup(), and the root check (including its
# message text) is an assumption - confirm all of it against the original.
case "${1:-}" in
    --help)
        show_usage
        exit 0
        ;;
    --show-errors)
        show_recent_ssh_errors "${2:-50}"
        exit 0
        ;;
esac

trap cleanup SIGTERM SIGINT

if [ "$EUID" -ne 0 ]; then
    echo "错误: 此脚本需要root权限运行" >&2
    exit 1
fi

# Create the log file directories
mkdir -p "$(dirname "$LOG_FILE")"
mkdir -p "$(dirname "$SSH_ERROR_LOG")"
touch "$LOG_FILE"
touch "$SSH_ERROR_LOG"

# Report the watchdog start to the monitoring API
report_to_monitor_api "watchdog_started" "SSH watchdog started with check interval ${CHECK_INTERVAL}s"

# Start the main loop
log_message "INFO" "=========================================="
log_message "INFO" "SSH服务看门狗即将启动主监控循环"
log_message "INFO" "=========================================="
log_message "INFO" "使用 '$0 --help' 查看可用命令"
log_message "INFO" "使用 '$0 --show-errors' 查看SSH错误日志"

main_loop