#!/bin/bash
#
# SSH service watchdog.
# Periodically checks the sshd process, its listening port and its
# responsiveness, and attempts to repair the service when checks fail.
#
# Every setting written as ${NAME:-default} can be overridden from the
# environment.

# ---- Basic settings ---------------------------------------------------------
SSH_PORT=${SSH_PORT:-22}
CHECK_INTERVAL=${CHECK_INTERVAL:-30}
MAX_RETRIES=${MAX_RETRIES:-3}
LOG_FILE="/var/log/ssh_watchdog.log"
SSH_ERROR_LOG="/var/log/ssh_error.log"
MAX_LOG_SIZE=10485760
SSH_CONFIG="/etc/ssh/sshd_config"
SSH_BACKUP_CONFIG="/etc/ssh/sshd_config.backup"
SSH_CONFIG_CHECKSUM="/etc/ssh/sshd_config.checksum"
NOTIFICATION_ENABLED=${NOTIFICATION_ENABLED:-false}
NOTIFICATION_WEBHOOK=${NOTIFICATION_WEBHOOK:-}
# "-" (not ":-") on purpose: the default applies only when the variable is
# unset. Exporting MONITOR_API_URL="" keeps it empty, which is the value the
# reporting function checks for in order to skip reporting.
MONITOR_API_URL=${MONITOR_API_URL-"http://localhost:7680/api/terminal"}

# ---- Restart rate limiting --------------------------------------------------
MAX_RESTARTS_PER_HOUR=${MAX_RESTARTS_PER_HOUR:-10}
restart_count=0
last_restart_hour=$(date +%Y%m%d%H)

# ---- Exponential backoff ----------------------------------------------------
BACKOFF_BASE=${BACKOFF_BASE:-2}
BACKOFF_MAX=${BACKOFF_MAX:-300}
current_backoff=1
|
|
| |
#######################################
# Append a timestamped entry to the SSH error log, then forward the same
# entry to log_message.
# Globals:   SSH_ERROR_LOG (read)
# Arguments: $1 - level, $2 - message text
#######################################
log_ssh_error() {
  local level=$1
  local message=$2
  local error_file="$SSH_ERROR_LOG"
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')

  # Make sure the error log and its directory exist before appending.
  mkdir -p "$(dirname "$error_file")"
  touch "$error_file"

  printf '[%s] [%s] %s\n' "$stamp" "$level" "$message" >> "$error_file"

  # Mirror the entry into the main watchdog log.
  log_message "$level" "$message"
}
|
|
| |
# ANSI colour sequences; they are expanded by printf when used inside a
# format string.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Best-effort creation of the main log file. This runs before the privilege
# check, so read-only invocations by an unprivileged user (e.g. --help) must
# not print "Permission denied" noise; the directory and file are created
# again after the root check further down.
mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true
touch "$LOG_FILE" 2>/dev/null || true
|
|
| |
| |
#######################################
# Print the SHA-256 digest of the sshd configuration file.
# Globals: SSH_CONFIG (read)
# Outputs: the hex digest on stdout; nothing when the file cannot be read
#######################################
calculate_config_checksum() {
  # sha256sum prints "<digest>  <file>"; keep only the digest field.
  sha256sum "$SSH_CONFIG" 2>/dev/null | cut -d' ' -f1
}
|
|
| |
#######################################
# Store the current configuration digest in the checksum file.
# Does nothing when no digest could be computed.
# Globals: SSH_CONFIG_CHECKSUM (read; the file is written)
#######################################
save_config_checksum() {
  local digest
  digest=$(calculate_config_checksum)

  # No digest (e.g. unreadable config): leave the stored value untouched.
  [ -n "$digest" ] || return 0

  echo "$digest" > "$SSH_CONFIG_CHECKSUM"
  log_message "DEBUG" "配置文件校验和已保存: $digest"
}
|
|
| |
#######################################
# Compare the current configuration digest with the stored one.
# When no stored digest exists, one is saved and the check passes.
# Globals: SSH_CONFIG_CHECKSUM (read)
# Returns: 0 when the digests match (or none was stored), 1 on mismatch
#######################################
verify_config_integrity() {
  local actual expected

  if ! [ -f "$SSH_CONFIG_CHECKSUM" ]; then
    log_message "WARN" "配置文件校验和不存在,跳过完整性检查"
    save_config_checksum
    return 0
  fi

  actual=$(calculate_config_checksum)
  expected=$(cat "$SSH_CONFIG_CHECKSUM" 2>/dev/null)

  if [ "$actual" = "$expected" ]; then
    return 0
  fi

  log_message "ERROR" "配置文件可能被篡改!校验和不匹配"
  log_message "ERROR" "期望: $expected"
  log_message "ERROR" "当前: $actual"
  return 1
}
|
|
| |
| |
#######################################
# Enforce the per-hour restart budget and count one restart against it.
# The counter is reset whenever the clock hour (YYYYmmddHH) changes.
# Globals: restart_count, last_restart_hour (read and written),
#          MAX_RESTARTS_PER_HOUR (read)
# Returns: 0 when a restart is allowed (and has been counted),
#          1 when the budget for the current hour is used up
#######################################
check_restart_rate_limit() {
  local hour_now
  hour_now=$(date +%Y%m%d%H)

  # A new clock hour starts a fresh budget.
  if [[ "$hour_now" != "$last_restart_hour" ]]; then
    restart_count=0
    last_restart_hour=$hour_now
  fi

  if (( restart_count >= MAX_RESTARTS_PER_HOUR )); then
    log_message "ERROR" "重启频率超限!过去1小时内已重启 $restart_count 次(最大 $MAX_RESTARTS_PER_HOUR 次)"
    return 1
  fi

  restart_count=$(( restart_count + 1 ))
  log_message "DEBUG" "重启计数: $restart_count/$MAX_RESTARTS_PER_HOUR (小时: $hour_now)"
  return 0
}
|
|
| |
| |
#######################################
# Print the backoff delay for the current backoff level:
# BACKOFF_BASE raised to current_backoff, capped at BACKOFF_MAX.
#
# The power is built up by repeated multiplication and clamped as soon as
# the cap is reached. A direct "BASE ** level" wraps around in 64-bit shell
# arithmetic once the level is large enough (2**63 is negative, 2**64 is 0),
# which would slip past the cap and produce a zero or negative delay.
# Globals: BACKOFF_BASE, BACKOFF_MAX, current_backoff (read)
# Outputs: the delay on stdout
#######################################
calculate_backoff() {
  local base=$BACKOFF_BASE
  local max=$BACKOFF_MAX
  local backoff=1
  local i

  for (( i = 0; i < current_backoff; i++ )); do
    backoff=$(( backoff * base ))
    if (( backoff >= max )); then
      backoff=$max
      break
    fi
  done

  # Covers a level of 0 combined with a cap below 1.
  if (( backoff > max )); then
    backoff=$max
  fi

  echo "$backoff"
}
|
|
| |
#######################################
# Sleep for the current backoff delay, then move to the next backoff level.
# Globals: current_backoff (read and written)
#######################################
apply_backoff() {
  local wait_seconds
  wait_seconds=$(calculate_backoff)

  log_message "WARN" "应用指数退避:等待 ${wait_seconds} 秒后重试..."
  sleep "$wait_seconds"

  current_backoff=$(( current_backoff + 1 ))
}
|
|
| |
#######################################
# Return to the first backoff level, logging the reset only when a backoff
# had actually been applied.
# Globals: current_backoff (read and written)
#######################################
reset_backoff() {
  (( current_backoff > 1 )) && log_message "INFO" "恢复成功,重置退避计数器"
  current_backoff=1
}
|
|
| |
| |
#######################################
# Run sshd in its test/debug modes and save the output to a report in /tmp.
# Outputs: the report path on stdout, and nothing else on stdout, because
#          callers capture it with $(...). Progress messages are sent to
#          stderr for that reason.
#######################################
capture_sshd_errors() {
  local error_output_file
  error_output_file="/tmp/sshd_error_output_$(date +%Y%m%d_%H%M%S).log"

  log_ssh_error "INFO" "正在捕获SSH守护进程的详细错误信息..." >&2

  # The whole group is already appended to the report, so the individual
  # commands must not also "tee -a" into the same file: that wrote every
  # section twice.
  {
    echo "=== SSH守护进程错误捕获 ==="
    echo "捕获时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo ""

    echo "=== 1. SSH配置测试结果 ==="
    sshd -t 2>&1
    echo ""

    echo "=== 2. SSH守护进程调试输出(前10行)==="
    # Foreground debug run, bounded to 3 seconds by timeout.
    timeout 3 sshd -d -D 2>&1 | head -10
    echo ""

    echo "=== 3. SSH配置文件的语法检查 ==="
    sshd -T 2>&1 | head -50
    echo ""
  } >> "$error_output_file" 2>&1

  log_ssh_error "INFO" "SSH错误详情已保存到: $error_output_file" >&2
  echo "$error_output_file"
}
|
|
| |
#######################################
# Collect SSH-related system information (journal/auth log, processes,
# listening sockets, kernel messages, /etc/ssh listing, disk usage) into a
# report in /tmp.
# Globals: SSH_PORT (read)
# Outputs: the report path on stdout, and nothing else on stdout, because
#          callers capture it with $(...). Progress messages go to stderr.
#######################################
collect_ssh_system_logs() {
  local log_file
  log_file="/tmp/ssh_system_logs_$(date +%Y%m%d_%H%M%S).log"

  log_ssh_error "INFO" "正在收集SSH相关的系统日志..." >&2

  {
    echo "=== SSH系统日志收集 ==="
    echo "收集时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo ""

    echo "=== 1. 系统日志中的SSH相关记录(最近50行)==="
    if command -v journalctl &> /dev/null; then
      # The unit is called "ssh" on some distributions and "sshd" on others;
      # ask for both.
      journalctl -u ssh -u sshd --no-pager -n 50 2>&1 || echo "无法获取journalctl日志"
    elif [ -f "/var/log/auth.log" ]; then
      tail -50 /var/log/auth.log 2>&1 | grep -i ssh || echo "无SSH相关日志"
    elif [ -f "/var/log/secure" ]; then
      tail -50 /var/log/secure 2>&1 | grep -i ssh || echo "无SSH相关日志"
    else
      echo "未找到SSH日志文件"
    fi
    echo ""

    echo "=== 2. SSH进程当前状态 ==="
    ps aux | grep sshd | grep -v grep || echo "无SSH进程运行"
    echo ""

    echo "=== 3. 网络连接状态(SSH端口)==="
    if command -v netstat &> /dev/null; then
      netstat -tulnp 2>&1 | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 无监听"
    elif command -v ss &> /dev/null; then
      ss -tulnp 2>&1 | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 无监听"
    fi
    echo ""

    echo "=== 4. 最近的系统消息(dmesg)==="
    if command -v dmesg &> /dev/null; then
      # The stderr merge belongs on dmesg (the command that can fail),
      # not on tail.
      dmesg 2>&1 | tail -50 | grep -i "ssh\|network\|connection" || echo "无相关内核日志"
    fi
    echo ""

    echo "=== 5. SSH配置文件权限检查 ==="
    ls -la /etc/ssh/ 2>&1 || echo "无法检查/etc/ssh权限"
    echo ""

    echo "=== 6. 文件系统状态 ==="
    df -h /etc/ssh 2>&1 || echo "无法检查文件系统"
    echo ""
  } >> "$log_file" 2>&1

  log_ssh_error "INFO" "SSH系统日志已保存到: $log_file" >&2
  echo "$log_file"
}
|
|
| |
#######################################
# Print the tail of the SSH error log followed by recent error-like lines
# from the system's SSH log.
# Globals:   SSH_ERROR_LOG (read)
# Arguments: $1 - number of lines to show (default 50)
#######################################
show_recent_ssh_errors() {
  local lines=${1:-50}

  echo "=== 最近的SSH错误日志(最后 $lines 行)==="
  echo ""

  if [ -f "$SSH_ERROR_LOG" ]; then
    tail -n "$lines" "$SSH_ERROR_LOG"
  else
    echo "SSH错误日志文件不存在: $SSH_ERROR_LOG"
  fi

  echo ""
  echo "=== 最近的系统日志中的SSH错误 ==="
  if command -v journalctl &> /dev/null; then
    # Query both unit names, since the service is "ssh" on some systems and
    # "sshd" on others.
    journalctl -u ssh -u sshd --no-pager -n "$lines" 2>&1 | grep -i "error\|fail\|denied" || echo "无错误日志"
  elif [ -f "/var/log/auth.log" ]; then
    tail -n "$lines" /var/log/auth.log 2>&1 | grep -i "error\|fail\|denied" || echo "无错误日志"
  elif [ -f "/var/log/secure" ]; then
    # Same fallback the other collectors in this script use.
    tail -n "$lines" /var/log/secure 2>&1 | grep -i "error\|fail\|denied" || echo "无错误日志"
  fi
}
|
|
| |
#######################################
# Run "sshd -t" and, when it fails, add the effective configuration and the
# non-comment lines of the config file to a report in /tmp.
# Globals: SSH_CONFIG (read)
# Outputs: the report path on stdout, and nothing else on stdout, because
#          callers capture it with $(...). Progress messages go to stderr.
#######################################
test_ssh_config_detailed() {
  local error_file
  local syntax_result=0
  local report
  error_file="/tmp/ssh_config_test_$(date +%Y%m%d_%H%M%S).log"

  log_ssh_error "INFO" "正在详细测试SSH配置文件..." >&2

  # A brace group runs in the current shell, so syntax_result is still set
  # after the group ends.
  {
    echo "=== SSH配置详细测试 ==="
    echo "测试时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo ""

    echo "=== 1. 配置语法检查 ==="
    sshd -t 2>&1 || syntax_result=$?
    echo "语法检查结果: $syntax_result"
    echo ""

    if [ "$syntax_result" -ne 0 ]; then
      echo "=== 2. 配置详细信息(sshd -T)==="
      sshd -T 2>&1 | head -100
      echo ""

      echo "=== 3. 配置文件内容(敏感信息已隐藏)==="
      if [ -f "$SSH_CONFIG" ]; then
        grep -v "^#" "$SSH_CONFIG" | grep -v "^$" | sed 's/PasswordAuthentication.*/PasswordAuthentication [HIDDEN]/g' | head -50
      fi
      echo ""
    fi
  } > "$error_file" 2>&1

  # Report errors only when the syntax check actually failed; a passing
  # check must not leave ERROR entries in the logs.
  if [ "$syntax_result" -ne 0 ]; then
    report=$(cat "$error_file")
    log_ssh_error "ERROR" "SSH配置测试发现错误:" >&2
    log_ssh_error "ERROR" "$report" >&2
  fi

  echo "$error_file"
}
|
|
| |
| |
#######################################
# Check that the sshd binary is executable, has no missing shared
# libraries, and accepts the current configuration ("sshd -t").
# Arguments: $1 - optional explicit path to the sshd binary; when omitted,
#                 /usr/sbin/sshd and then /usr/bin/sshd are tried
# Returns:   0 when all checks pass, 1 otherwise
#######################################
check_sshd_binary() {
  local sshd_path=${1:-}
  local missing_libs
  local lib_line

  if [ -n "$sshd_path" ]; then
    if [ ! -x "$sshd_path" ]; then
      log_message "ERROR" "sshd二进制文件不可执行: $sshd_path"
      return 1
    fi
  elif [ -x "/usr/sbin/sshd" ]; then
    sshd_path="/usr/sbin/sshd"
  elif [ -x "/usr/bin/sshd" ]; then
    sshd_path="/usr/bin/sshd"
  else
    log_message "ERROR" "找不到sshd可执行文件"
    return 1
  fi

  # Shared-library check (skipped when ldd is unavailable).
  if command -v ldd &> /dev/null; then
    missing_libs=$(ldd "$sshd_path" 2>&1 | grep "not found" || true)
    if [ -n "$missing_libs" ]; then
      log_message "ERROR" "sshd依赖库缺失:"
      # read -r keeps backslashes intact; the here-string avoids a pipeline.
      while read -r lib_line; do
        log_message "ERROR" " $lib_line"
      done <<< "$missing_libs"
      return 1
    fi
  fi

  # Configuration test with the selected binary.
  if ! "$sshd_path" -t 2>&1; then
    log_message "ERROR" "sshd配置测试失败"
    return 1
  fi

  log_message "DEBUG" "sshd二进制文件检查通过: $sshd_path"
  return 0
}
|
|
| |
| |
#######################################
# Write a start-up diagnostics report (binary, libraries, config file,
# port usage, recent logs, filesystem, permissions) to a file in /tmp.
# Globals: SSH_CONFIG, SSH_PORT (read)
# Outputs: the report path on stdout, and nothing else on stdout, because
#          callers capture it with $(...). Progress messages go to stderr.
#######################################
collect_startup_diagnostics() {
  local diag_file
  diag_file="/tmp/ssh_startup_diagnostics_$(date +%Y%m%d_%H%M%S).log"

  log_message "INFO" "收集启动诊断信息: $diag_file" >&2

  {
    echo "=== SSH启动诊断报告 ==="
    echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo ""

    echo "=== SSH二进制文件 ==="
    # command -v is a shell builtin, unlike the external "which".
    command -v sshd 2>/dev/null || echo "sshd未找到在PATH中"
    ls -la /usr/sbin/sshd /usr/bin/sshd 2>/dev/null || echo "sshd二进制文件不存在"
    file /usr/sbin/sshd /usr/bin/sshd 2>/dev/null || echo "无法检查文件类型"
    echo ""

    echo "=== 依赖库检查 ==="
    if command -v ldd &> /dev/null; then
      ldd /usr/sbin/sshd 2>/dev/null || ldd /usr/bin/sshd 2>/dev/null || echo "无法检查依赖库"
    fi
    echo ""

    echo "=== 配置文件检查 ==="
    if [ -f "$SSH_CONFIG" ]; then
      echo "配置文件存在: $SSH_CONFIG"
      echo "配置文件权限: $(ls -la "$SSH_CONFIG")"
      echo "配置文件大小: $(stat -c%s "$SSH_CONFIG" 2>/dev/null || echo '未知') 字节"
      echo ""
      echo "=== 配置内容(敏感信息已隐藏)==="
      grep -v "^#" "$SSH_CONFIG" | grep -v "^$" | sed 's/PasswordAuthentication.*/PasswordAuthentication [HIDDEN]/g' | head -50
    else
      echo "配置文件不存在: $SSH_CONFIG"
    fi
    echo ""

    echo "=== 端口占用检查 ==="
    if command -v netstat &> /dev/null; then
      netstat -tulnp 2>/dev/null | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 未被占用"
    elif command -v ss &> /dev/null; then
      ss -tulnp 2>/dev/null | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 未被占用"
    fi
    echo ""

    echo "=== 最近的错误日志 ==="
    if [ -f "/var/log/auth.log" ]; then
      tail -50 /var/log/auth.log 2>/dev/null | grep -i ssh || echo "无SSH相关日志"
    elif [ -f "/var/log/secure" ]; then
      tail -50 /var/log/secure 2>/dev/null | grep -i ssh || echo "无SSH相关日志"
    elif command -v journalctl &> /dev/null; then
      # The unit is "ssh" on some distributions and "sshd" on others.
      journalctl -u ssh -u sshd --no-pager -n 50 2>/dev/null || echo "无法获取journalctl日志"
    fi
    echo ""

    echo "=== 文件系统检查 ==="
    df -h /etc/ssh 2>/dev/null || echo "无法检查文件系统"
    echo ""

    echo "=== 权限检查 ==="
    ls -la /etc/ssh/ 2>/dev/null || echo "无法检查/etc/ssh权限"
    ls -la /var/run/ 2>/dev/null | grep ssh || echo "无法检查/var/run/ssh"
    echo ""
  } > "$diag_file" 2>&1

  log_message "INFO" "启动诊断信息已保存: $diag_file" >&2
  echo "$diag_file"
}
|
|
| |
#######################################
# Write a log entry to the main log file and to the console.
# The log file is rotated (renamed with a timestamp suffix) once it grows
# beyond MAX_LOG_SIZE bytes. On a terminal the console copy is coloured by
# level; ERROR entries go to stderr, everything else to stdout.
# Globals:   LOG_FILE, MAX_LOG_SIZE, RED, GREEN, YELLOW, BLUE, NC (read)
# Arguments: $1 - level, $2 - message text
#######################################
log_message() {
  local level=$1
  local message=$2
  local now
  local current_size
  local tint=""
  now=$(date '+%Y-%m-%d %H:%M:%S')

  # Size-based rotation.
  if [ -f "$LOG_FILE" ]; then
    current_size=$(stat -c%s "$LOG_FILE" 2>/dev/null || echo "0")
    if [ "$current_size" -gt $MAX_LOG_SIZE ]; then
      mv "$LOG_FILE" "${LOG_FILE}.$(date +%Y%m%d_%H%M%S)" 2>/dev/null
      touch "$LOG_FILE"
    fi
  fi

  printf "[%s] [%s] %s\n" "$now" "$level" "$message" >> "$LOG_FILE"

  # Colour is used only when stdout is a terminal and the level is known.
  if [ -t 1 ]; then
    case $level in
      "INFO") tint=$GREEN ;;
      "WARN") tint=$YELLOW ;;
      "ERROR") tint=$RED ;;
      "DEBUG") tint=$BLUE ;;
    esac
  fi

  if [ -n "$tint" ]; then
    # The colour codes sit in the format string so printf expands the
    # escape sequences.
    if [ "$level" = "ERROR" ]; then
      printf "${tint}[%s] [%s] %s${NC}\n" "$now" "$level" "$message" >&2
    else
      printf "${tint}[%s] [%s] %s${NC}\n" "$now" "$level" "$message"
    fi
  elif [ "$level" = "ERROR" ] && ! [ -t 1 ]; then
    printf "[%s] [%s] %s\n" "$now" "$level" "$message" >&2
  else
    printf "[%s] [%s] %s\n" "$now" "$level" "$message"
  fi
}
|
|
| |
# Escape a string for use inside a JSON string literal (backslash, double
# quote, newline, carriage return, tab).
_notify_json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

#######################################
# Send an alert to the configured webhook.
# Does nothing unless NOTIFICATION_ENABLED is "true"; the HTTP call is made
# only when NOTIFICATION_WEBHOOK is non-empty. The curl output is appended
# to the main log file.
# Globals:   NOTIFICATION_ENABLED, NOTIFICATION_WEBHOOK, LOG_FILE (read)
# Arguments: $1 - subject, $2 - message text
#######################################
send_notification() {
  local subject=$1
  local message=$2
  local payload

  if [ "$NOTIFICATION_ENABLED" != "true" ]; then
    return 0
  fi

  log_message "INFO" "发送通知: $subject"

  if [ -n "$NOTIFICATION_WEBHOOK" ]; then
    # Subject and message are free text (they may carry quotes or embedded
    # newlines), so they are JSON-escaped before being placed in the body.
    # In the format string "\\n" yields a literal backslash-n (the JSON
    # newline escape) and "\n" yields a real line break.
    printf -v payload '{\n"msgtype": "text",\n"text": {\n"content": "SSH看门狗告警\\n标题: %s\\n详情: %s\\n时间: %s"\n}\n}' \
      "$(_notify_json_escape "$subject")" \
      "$(_notify_json_escape "$message")" \
      "$(date '+%Y-%m-%d %H:%M:%S')"

    # --max-time bounds the call so an unresponsive webhook cannot stall
    # the watchdog.
    curl -s --max-time 10 -X POST "$NOTIFICATION_WEBHOOK" \
      -H 'Content-Type: application/json' \
      -d "$payload" >> "$LOG_FILE" 2>&1
  fi
}
|
|
| |
#######################################
# Write a general diagnostics report (system info, sshd processes, port
# state, config test, resources, recent logs, firewall) to a file in /tmp.
# Globals: SSH_PORT, SSH_CONFIG (read)
# Outputs: the report path on stdout, and nothing else on stdout, because
#          callers capture it with $(...). Progress messages go to stderr.
#######################################
collect_diagnostics() {
  local diag_file
  diag_file="/tmp/ssh_diagnostics_$(date +%Y%m%d_%H%M%S).log"

  log_message "INFO" "收集诊断信息到: $diag_file" >&2

  {
    echo "=== SSH服务诊断报告 ==="
    echo "收集时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo ""

    echo "=== 系统信息 ==="
    uname -a
    echo ""

    echo "=== SSH进程状态 ==="
    ps aux | grep sshd | grep -v grep || echo "无SSH进程运行"
    echo ""

    echo "=== 网络连接状态 ==="
    # The trailing space in the pattern keeps port 22 from matching 2222;
    # ss is the fallback where netstat is not installed.
    if command -v netstat &> /dev/null; then
      netstat -tulnp 2>/dev/null | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 无监听"
    elif command -v ss &> /dev/null; then
      ss -tulnp 2>/dev/null | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 无监听"
    else
      echo "端口 $SSH_PORT 无监听"
    fi
    echo ""

    echo "=== SSH配置检查 ==="
    if [ -f "$SSH_CONFIG" ]; then
      sshd -t 2>&1 || echo "SSH配置文件有错误"
    else
      echo "SSH配置文件不存在: $SSH_CONFIG"
    fi
    echo ""

    echo "=== 系统资源状态 ==="
    echo "内存使用:"
    free -h
    echo ""
    echo "磁盘使用:"
    df -h
    echo ""
    echo "系统负载:"
    uptime
    echo ""

    echo "=== 最近的系统日志(SSH相关)==="
    if command -v journalctl &> /dev/null; then
      # The unit is "ssh" on some distributions and "sshd" on others.
      journalctl -u ssh -u sshd --no-pager -n 50 2>/dev/null || echo "无法获取journalctl日志"
    else
      tail -100 /var/log/auth.log 2>/dev/null || tail -100 /var/log/secure 2>/dev/null || echo "无法获取SSH日志"
    fi
    echo ""

    echo "=== 防火墙状态 ==="
    if command -v iptables &> /dev/null; then
      iptables -L -n 2>/dev/null | head -50
    fi
    if command -v ufw &> /dev/null; then
      ufw status 2>/dev/null
    fi
    echo ""
  } > "$diag_file" 2>&1

  log_message "INFO" "诊断信息已保存到: $diag_file" >&2
  echo "$diag_file"
}
|
|
| |
#######################################
# Tell whether a process named exactly "sshd" is running.
# Returns: 0 when pgrep finds one, 1 otherwise
#######################################
check_ssh_process() {
  # Any pgrep failure (no match or error) is reported as 1.
  pgrep -x "sshd" > /dev/null || return 1
}
|
|
| |
#######################################
# Tell whether something is listening on SSH_PORT.
# netstat is used when available, otherwise ss. Without the fallback the
# check fails on every cycle on hosts that do not ship netstat, which makes
# the watchdog treat a healthy service as broken.
# Globals: SSH_PORT (read)
# Returns: 0 when a listener is found, 1 otherwise
#######################################
check_ssh_port() {
  local listeners

  if command -v netstat &> /dev/null; then
    listeners=$(netstat -tuln 2>/dev/null)
  elif command -v ss &> /dev/null; then
    listeners=$(ss -tuln 2>/dev/null)
  else
    return 1
  fi

  # The trailing space anchors the port so 22 does not match 2222.
  grep -q ":$SSH_PORT " <<< "$listeners" || return 1
}
|
|
| |
#######################################
# Probe the SSH port on localhost by opening a TCP connection, giving up
# after 5 seconds.
# Globals: SSH_PORT (read)
# Returns: the exit status of the timeout-wrapped probe
#######################################
check_ssh_response() {
  # Redirecting from /dev/tcp/<host>/<port> makes bash open a TCP socket.
  local probe="</dev/tcp/localhost/$SSH_PORT"
  timeout 5 bash -c "$probe" 2>/dev/null
}
|
|
| |
#######################################
# Warn when memory or root-filesystem usage exceeds 90%.
# Returns: 1 when either figure is above 90%, 0 otherwise (also when a
#          figure could not be determined)
#######################################
check_system_resources() {
  local memory_usage
  local disk_usage

  # LC_ALL=C pins the "Mem:" row label, which "free" translates in some
  # locales; the $2 > 0 guard avoids a division by zero.
  memory_usage=$(LC_ALL=C free 2>/dev/null \
    | awk '/^Mem:/ && $2 > 0 { printf "%d", ($3 / $2) * 100 }')

  # -P keeps each filesystem on a single line even for long device names.
  disk_usage=$(LC_ALL=C df -P / 2>/dev/null \
    | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

  # Compare only when the value is a plain integer; an empty or odd value
  # would otherwise make the numeric test print an error on every cycle.
  if [[ "$memory_usage" =~ ^[0-9]+$ ]] && [ "$memory_usage" -gt 90 ]; then
    log_message "WARN" "内存使用率过高: ${memory_usage}%"
    return 1
  fi

  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && [ "$disk_usage" -gt 90 ]; then
    log_message "WARN" "磁盘使用率过高: ${disk_usage}%"
    return 1
  fi

  return 0
}
|
|
| |
#######################################
# Attempt to repair and restart sshd.
# Sequence: rate-limit check, diagnostics, stop sshd, binary check, config
# integrity check, config syntax check (with restore / regeneration of the
# config), backup of a valid config, start sshd, verification.
# Globals: MAX_RESTARTS_PER_HOUR, SSH_CONFIG, SSH_BACKUP_CONFIG,
#          SSH_ERROR_LOG (read)
# Returns: 0 when sshd is running and listening afterwards, 1 otherwise
#######################################
repair_ssh_service() {
  local diag_file
  local startup_diag
  local config_error_file
  local sshd_path=""
  local sshd_output
  local sshd_debug
  local system_logs
  local err_line

  log_message "INFO" "开始尝试修复SSH服务..."

  # Rate limiting.
  if ! check_restart_rate_limit; then
    send_notification "SSH重启频率超限" "过去1小时内已尝试重启 $MAX_RESTARTS_PER_HOUR 次,暂停自动重启"
    return 1
  fi

  # The collectors print their report path as the last line of output; keep
  # only that line in case log lines were captured along with it.
  diag_file=$(collect_diagnostics)
  diag_file=${diag_file##*$'\n'}
  send_notification "SSH服务异常" "SSH服务检测到异常,开始尝试修复。诊断信息: $diag_file"

  # Stop whatever is still running.
  log_message "INFO" "停止现有的SSH服务..."
  killall sshd 2>/dev/null
  sleep 2

  # Binary check: a failure is reported but the repair continues.
  log_message "INFO" "检查SSH二进制文件完整性..."
  if ! check_sshd_binary; then
    log_message "ERROR" "SSH二进制文件检查失败,尝试收集更多信息..."
    startup_diag=$(collect_startup_diagnostics)
    startup_diag=${startup_diag##*$'\n'}
    send_notification "SSH二进制文件损坏" "SSH二进制文件检查失败,诊断信息: $startup_diag"
  fi

  # Config integrity: on mismatch fall back to the backup copy.
  log_message "INFO" "验证SSH配置完整性..."
  if ! verify_config_integrity; then
    log_ssh_error "WARN" "配置完整性验证失败,使用备份配置"
    if [ -f "$SSH_BACKUP_CONFIG" ]; then
      # Report the restore only when the copy actually succeeded.
      if cp "$SSH_BACKUP_CONFIG" "$SSH_CONFIG"; then
        log_message "INFO" "已恢复备份配置"
      fi
    fi
  fi

  # Config syntax.
  log_message "INFO" "检查SSH配置文件..."
  if ! sshd -t 2>> "$SSH_ERROR_LOG"; then
    log_ssh_error "ERROR" "SSH配置文件有错误,详细信息:"
    config_error_file=$(test_ssh_config_detailed)
    config_error_file=${config_error_file##*$'\n'}
    log_ssh_error "ERROR" "配置文件测试详情已保存到: $config_error_file"

    if [ -f "$SSH_BACKUP_CONFIG" ]; then
      log_message "INFO" "使用备份配置文件: $SSH_BACKUP_CONFIG"
      cp "$SSH_BACKUP_CONFIG" "$SSH_CONFIG"
    else
      log_message "WARN" "备份配置不存在,创建最小配置"
      create_minimal_ssh_config
    fi

    # Still broken after the restore: regenerate.
    if ! sshd -t 2>> "$SSH_ERROR_LOG"; then
      log_ssh_error "ERROR" "配置修复失败,使用默认配置"
      create_default_ssh_config
    fi
  fi

  # A config that passes the test becomes the new backup.
  if sshd -t 2>/dev/null; then
    cp "$SSH_CONFIG" "$SSH_BACKUP_CONFIG" 2>/dev/null
    save_config_checksum
    log_message "INFO" "SSH配置已备份到: $SSH_BACKUP_CONFIG"
  fi

  # Locate the binary.
  log_message "INFO" "启动SSH服务..."
  if [ -x "/usr/sbin/sshd" ]; then
    sshd_path="/usr/sbin/sshd"
  elif [ -x "/usr/bin/sshd" ]; then
    sshd_path="/usr/bin/sshd"
  else
    log_ssh_error "ERROR" "找不到sshd可执行文件"
    startup_diag=$(collect_startup_diagnostics)
    startup_diag=${startup_diag##*$'\n'}
    log_ssh_error "ERROR" "启动诊断信息: $startup_diag"
    send_notification "SSH修复失败" "找不到sshd可执行文件。诊断: $startup_diag"
    return 1
  fi

  # Start sshd, keeping its stderr for the error log.
  log_message "INFO" "尝试启动SSH服务: $sshd_path"
  sshd_output=$(mktemp)
  if ! "$sshd_path" 2> "$sshd_output"; then
    log_ssh_error "ERROR" "SSH服务启动失败,详细错误信息:"
    while read -r err_line; do
      log_ssh_error "ERROR" " $err_line"
    done < "$sshd_output"

    log_ssh_error "ERROR" "SSH守护进程调试信息:"
    sshd_debug=$(capture_sshd_errors)
    sshd_debug=${sshd_debug##*$'\n'}
    log_ssh_error "ERROR" "调试信息已保存到: $sshd_debug"

    log_ssh_error "ERROR" "系统日志信息:"
    system_logs=$(collect_ssh_system_logs)
    system_logs=${system_logs##*$'\n'}
    log_ssh_error "ERROR" "系统日志已保存到: $system_logs"

    send_notification "SSH启动失败" "SSH服务启动失败,详细错误信息已记录到: $SSH_ERROR_LOG"
    rm -f "$sshd_output"
    return 1
  fi
  rm -f "$sshd_output"

  sleep 3

  # Verify the outcome.
  if check_ssh_process && check_ssh_port; then
    log_message "INFO" "SSH服务修复成功"
    reset_backoff

    if [ -f "$SSH_ERROR_LOG" ]; then
      log_message "INFO" "最近的SSH错误日志:"
      show_recent_ssh_errors 10
    fi

    send_notification "SSH服务恢复" "SSH服务已成功修复并重新启动"
    report_to_monitor_api "ssh_recovered" "SSH service has been successfully recovered"
    return 0
  fi

  log_message "ERROR" "SSH服务修复失败,应用指数退避"
  apply_backoff
  send_notification "SSH修复失败" "SSH服务修复失败,已应用退避策略等待重试。诊断信息: $diag_file"
  report_to_monitor_api "ssh_repair_failed" "SSH service repair failed, applying backoff"
  return 1
}
|
|
| |
#######################################
# Replace the sshd configuration with a minimal generated one.
# A non-empty existing configuration is first copied to
# "<config>.replaced.<timestamp>" so that it is not lost by the overwrite.
# NOTE(review): the generated file enables root login and password
# authentication — confirm this matches the site's security policy.
# Globals: SSH_CONFIG, SSH_PORT (read)
#######################################
create_minimal_ssh_config() {
  local saved_copy

  log_message "INFO" "创建最小SSH配置..."

  # Keep a copy of what is about to be overwritten.
  if [ -s "$SSH_CONFIG" ]; then
    saved_copy="${SSH_CONFIG}.replaced.$(date +%Y%m%d_%H%M%S)"
    if cp -p "$SSH_CONFIG" "$saved_copy" 2>/dev/null; then
      log_message "INFO" "原配置已另存为: $saved_copy"
    else
      log_message "WARN" "无法保存原配置副本: $saved_copy"
    fi
  fi

  cat > "$SSH_CONFIG" <<EOF
# 最小SSH配置(由看门狗自动生成)
Port $SSH_PORT
AddressFamily any
ListenAddress 0.0.0.0
ListenAddress ::
PermitRootLogin yes
PasswordAuthentication yes
PubkeyAuthentication yes
AuthorizedKeysFile .ssh/authorized_keys
UsePAM yes
PrintMotd no
AcceptEnv LANG LC_*
Subsystem sftp /usr/lib/openssh/sftp-server
EOF
  log_message "INFO" "最小SSH配置已创建: $SSH_CONFIG"
}
|
|
| |
#######################################
# Install the distribution default configuration when
# /etc/ssh/sshd_config.default exists; otherwise generate the minimal one.
# Globals: SSH_CONFIG (read; the file is written)
#######################################
create_default_ssh_config() {
  local template="/etc/ssh/sshd_config.default"

  log_message "WARN" "创建默认SSH配置..."

  if [ ! -f "$template" ]; then
    # No default template available: fall back to the minimal config.
    create_minimal_ssh_config
    return
  fi

  cp "$template" "$SSH_CONFIG"
}
|
|
| |
# Escape a string for use inside a JSON string literal (backslash, double
# quote, newline, carriage return, tab).
_monitor_json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

#######################################
# POST a status event to "$MONITOR_API_URL/events".
# Skipped when MONITOR_API_URL is empty. The curl output is appended to the
# main log file.
# Globals:   MONITOR_API_URL, LOG_FILE (read)
# Arguments: $1 - event name, $2 - message text
#######################################
report_to_monitor_api() {
  local status=$1
  local message=$2
  local payload

  if [ -z "$MONITOR_API_URL" ]; then
    return 0
  fi

  log_message "DEBUG" "报告状态到监控API: $status"

  # Both values are JSON-escaped so quotes or newlines in them cannot break
  # the request body.
  printf -v payload '{\n"event": "%s",\n"message": "%s",\n"timestamp": "%s",\n"service": "ssh_watchdog"\n}' \
    "$(_monitor_json_escape "$status")" \
    "$(_monitor_json_escape "$message")" \
    "$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  # --max-time bounds the call so an unresponsive endpoint cannot stall the
  # watchdog.
  curl -s --max-time 10 -X POST "$MONITOR_API_URL/events" \
    -H 'Content-Type: application/json' \
    -d "$payload" >> "$LOG_FILE" 2>&1
}
|
|
| |
#######################################
# Main monitoring loop (never returns).
# Each cycle checks system resources, the sshd process, the listening port
# and the TCP response. After MAX_RETRIES consecutive failed cycles a repair
# is attempted; once failures reach three times MAX_RETRIES and the repair
# has failed again, the forced recovery is run.
# Globals: CHECK_INTERVAL, SSH_PORT, MAX_RETRIES, NOTIFICATION_ENABLED,
#          MAX_RESTARTS_PER_HOUR, BACKOFF_BASE, BACKOFF_MAX, SSH_CONFIG,
#          SSH_BACKUP_CONFIG, current_backoff, restart_count (read)
#######################################
main_loop() {
  local consecutive_failures=0
  local total_checks=0
  local last_notify_time=0
  local notify_interval=3600
  local current_time
  local ssh_status
  local diag_file
  local notify_state="禁用"

  if [ "$NOTIFICATION_ENABLED" = "true" ]; then
    notify_state="启用"
  fi

  log_message "INFO" "SSH服务看门狗启动,检查间隔: ${CHECK_INTERVAL}秒"
  log_message "INFO" "配置: SSH端口=$SSH_PORT, 最大重试次数=$MAX_RETRIES, 通知=$notify_state"
  log_message "INFO" "重启频率限制: $MAX_RESTARTS_PER_HOUR/小时, 指数退避: ${BACKOFF_BASE}s 基础, ${BACKOFF_MAX}s 最大"

  # Initial backup and checksum of a configuration that passes "sshd -t".
  if [ -f "$SSH_CONFIG" ] && sshd -t 2>/dev/null; then
    cp "$SSH_CONFIG" "$SSH_BACKUP_CONFIG" 2>/dev/null
    save_config_checksum
    log_message "INFO" "初始SSH配置备份和校验和已保存"
  else
    log_message "WARN" "初始SSH配置检查失败,将在首次修复时创建"
  fi

  while true; do
    total_checks=$((total_checks + 1))
    current_time=$(date +%s)
    ssh_status="unknown"

    # NOTE(review): every branch of the health check below assigns
    # ssh_status, so "resource_warning" never reaches the periodic report.
    if ! check_system_resources; then
      log_message "WARN" "系统资源不足,可能影响SSH服务稳定性"
      ssh_status="resource_warning"
    fi

    # Health checks, from cheapest to most specific.
    if ! check_ssh_process; then
      log_message "ERROR" "SSH进程不存在"
      consecutive_failures=$((consecutive_failures + 1))
      ssh_status="no_process"
    elif ! check_ssh_port; then
      log_message "ERROR" "SSH端口 $SSH_PORT 未监听"
      consecutive_failures=$((consecutive_failures + 1))
      ssh_status="port_not_listening"
    elif ! check_ssh_response; then
      log_message "WARN" "SSH端口监听但无响应"
      consecutive_failures=$((consecutive_failures + 1))
      ssh_status="no_response"
    else
      ssh_status="healthy"
      if [ "$consecutive_failures" -gt 0 ]; then
        log_message "INFO" "SSH服务恢复正常 (之前连续失败 $consecutive_failures 次)"
        send_notification "SSH服务恢复" "SSH服务已恢复正常 after $consecutive_failures 次失败"
        report_to_monitor_api "ssh_healthy" "SSH service is healthy after $consecutive_failures failures"
      fi
      consecutive_failures=0
      reset_backoff
    fi

    # Repair once the failure threshold is reached.
    if [ "$consecutive_failures" -ge "$MAX_RETRIES" ]; then
      log_message "ERROR" "SSH服务连续失败 $consecutive_failures 次,开始修复..."
      send_notification "SSH服务异常" "SSH服务连续失败 $consecutive_failures 次,开始自动修复"

      if repair_ssh_service; then
        consecutive_failures=0
        log_message "INFO" "SSH服务修复成功,重置失败计数器"
        send_notification "SSH修复成功" "SSH服务已成功修复并重新启动"
      else
        log_message "ERROR" "SSH服务修复失败,将在下次检查时重试"

        # Failure notifications are throttled to one per notify_interval.
        if [ $((current_time - last_notify_time)) -gt "$notify_interval" ]; then
          # Keep only the last output line (the report path) in case log
          # lines were captured along with it; they would otherwise end up
          # inside the notification text.
          diag_file=$(collect_diagnostics)
          diag_file=${diag_file##*$'\n'}
          send_notification "SSH修复失败" "SSH服务修复失败,诊断信息已保存到: $diag_file"
          last_notify_time=$current_time
        fi

        # Escalate after three times the retry threshold.
        if [ "$consecutive_failures" -ge $((MAX_RETRIES * 3)) ]; then
          log_message "ERROR" "SSH服务多次修复失败(已失败 $consecutive_failures 次),尝试强制恢复..."
          force_recover_ssh
        fi
      fi
    fi

    # Status report every 10 cycles.
    if [ $((total_checks % 10)) -eq 0 ]; then
      log_message "INFO" "SSH服务状态: $ssh_status, 已连续监控 $total_checks 个周期, 连续失败: $consecutive_failures, 退避: $(( current_backoff - 1 )) 级"
      report_to_monitor_api "ssh_status" "Status: $ssh_status, Checks: $total_checks, Failures: $consecutive_failures, Backoff: $(( current_backoff - 1 ))"
    fi

    # Housekeeping every 100 cycles.
    if [ $((total_checks % 100)) -eq 0 ]; then
      log_message "DEBUG" "看门狗运行正常,已执行 $total_checks 次检查,重启计数: $restart_count"
      cleanup_old_diagnostics
      verify_config_integrity || log_message "WARN" "配置完整性验证失败"
    fi

    sleep "$CHECK_INTERVAL"
  done
}
|
|
| |
#######################################
# Forced recovery of sshd: kill all sshd processes, clean temporary files,
# recreate runtime directories, reconfigure/reinstall the package, generate
# missing host keys, write the minimal configuration and start sshd.
# Globals: SSH_ERROR_LOG (read)
# Returns: 0 when sshd is running and listening afterwards, 1 otherwise
#######################################
force_recover_ssh() {
  local config_error_file
  local sshd_path=""
  local startup_diag
  local system_logs
  local sshd_output
  local sshd_debug
  local err_line

  log_message "WARN" "执行强制恢复策略..."

  # Rate limiting.
  if ! check_restart_rate_limit; then
    send_notification "SSH强制恢复失败" "重启频率超限,跳过强制恢复"
    return 1
  fi

  log_ssh_error "WARN" "开始强制恢复SSH服务..."

  # 1. Kill every sshd process.
  log_message "INFO" "强制终止所有SSH进程..."
  log_ssh_error "INFO" "强制终止所有SSH进程"
  killall -9 sshd 2>/dev/null || true
  sleep 5

  # 2. Remove temporary files and the pid file.
  log_message "INFO" "清理SSH相关临时文件..."
  log_ssh_error "INFO" "清理SSH相关临时文件"
  rm -f /tmp/ssh-* 2>/dev/null || true
  rm -f /var/run/sshd.pid 2>/dev/null || true

  # 3. Make sure the required directories exist.
  log_message "INFO" "检查SSH必要目录..."
  log_ssh_error "INFO" "检查SSH必要目录和权限"
  mkdir -p /var/run/sshd 2>/dev/null || true
  mkdir -p /etc/ssh 2>/dev/null || true
  chmod 755 /var/run/sshd 2>/dev/null || true

  # 4. Reconfigure / reinstall the package.
  if command -v apt-get &> /dev/null; then
    log_message "INFO" "尝试重新配置SSH服务..."
    log_ssh_error "INFO" "尝试重新配置SSH服务 (dpkg-reconfigure)"
    dpkg-reconfigure -f noninteractive openssh-server 2>> "$SSH_ERROR_LOG" || true

    if ! sshd -t 2>/dev/null; then
      log_message "WARN" "尝试重新安装openssh-server..."
      log_ssh_error "WARN" "配置测试失败,尝试重新安装openssh-server"
      apt-get update -qq 2>> "$SSH_ERROR_LOG" || true
      DEBIAN_FRONTEND=noninteractive apt-get install -y --reinstall openssh-server 2>> "$SSH_ERROR_LOG" || true
    fi
  elif command -v yum &> /dev/null; then
    log_message "INFO" "尝试重新安装openssh-server (yum)..."
    log_ssh_error "INFO" "尝试重新安装openssh-server (yum)"
    yum reinstall -y openssh-server 2>> "$SSH_ERROR_LOG" || true
  fi

  # 5. Generate any missing host keys.
  log_message "INFO" "检查SSH主机密钥..."
  log_ssh_error "INFO" "检查并生成SSH主机密钥"
  if [ ! -f "/etc/ssh/ssh_host_rsa_key" ]; then
    log_message "INFO" "生成SSH RSA主机密钥..."
    ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' 2>> "$SSH_ERROR_LOG" || true
  fi
  if [ ! -f "/etc/ssh/ssh_host_ecdsa_key" ]; then
    log_message "INFO" "生成SSH ECDSA主机密钥..."
    ssh-keygen -t ecdsa -f /etc/ssh/ssh_host_ecdsa_key -N '' 2>> "$SSH_ERROR_LOG" || true
  fi
  if [ ! -f "/etc/ssh/ssh_host_ed25519_key" ]; then
    log_message "INFO" "生成SSH ED25519主机密钥..."
    ssh-keygen -t ed25519 -f /etc/ssh/ssh_host_ed25519_key -N '' 2>> "$SSH_ERROR_LOG" || true
  fi

  # 6. Write the minimal configuration.
  log_message "INFO" "使用最保守的SSH配置..."
  log_ssh_error "INFO" "使用最保守的SSH配置"
  create_minimal_ssh_config

  # 7. Validate it. The helper's report path is the last line of its
  #    output; keep only that line in case log lines were captured too.
  log_ssh_error "INFO" "验证SSH配置(详细模式)"
  if ! sshd -t 2>> "$SSH_ERROR_LOG"; then
    log_ssh_error "ERROR" "SSH配置验证失败,详细信息:"
    config_error_file=$(test_ssh_config_detailed)
    config_error_file=${config_error_file##*$'\n'}
    log_ssh_error "ERROR" "配置测试详情已保存到: $config_error_file"

    log_message "ERROR" "SSH配置验证失败,使用默认配置"
    create_default_ssh_config
  fi

  # 8. Locate the binary, installing the package when it is missing.
  log_message "INFO" "强制启动SSH服务..."
  log_ssh_error "INFO" "尝试启动SSH服务"

  if [ -x "/usr/sbin/sshd" ]; then
    sshd_path="/usr/sbin/sshd"
  elif [ -x "/usr/bin/sshd" ]; then
    sshd_path="/usr/bin/sshd"
  else
    log_ssh_error "ERROR" "找不到sshd可执行文件,尝试重新安装..."
    startup_diag=$(collect_startup_diagnostics)
    startup_diag=${startup_diag##*$'\n'}
    log_ssh_error "ERROR" "启动诊断信息: $startup_diag"
    send_notification "SSH二进制文件丢失" "找不到sshd可执行文件。诊断: $startup_diag"

    if command -v apt-get &> /dev/null; then
      log_ssh_error "INFO" "尝试安装openssh-server"
      DEBIAN_FRONTEND=noninteractive apt-get install -y openssh-server 2>> "$SSH_ERROR_LOG" || true
    elif command -v yum &> /dev/null; then
      # Same package-manager coverage as step 4.
      log_ssh_error "INFO" "尝试安装openssh-server"
      yum install -y openssh-server 2>> "$SSH_ERROR_LOG" || true
    fi

    if [ -x "/usr/sbin/sshd" ]; then
      sshd_path="/usr/sbin/sshd"
    elif [ -x "/usr/bin/sshd" ]; then
      sshd_path="/usr/bin/sshd"
    else
      log_ssh_error "ERROR" "重新安装后仍然找不到sshd,无法强制恢复"
      system_logs=$(collect_ssh_system_logs)
      system_logs=${system_logs##*$'\n'}
      log_ssh_error "ERROR" "系统日志已保存到: $system_logs"
      return 1
    fi
  fi

  # 9. Binary check: a failure is reported but start-up is still attempted.
  if ! check_sshd_binary; then
    log_ssh_error "ERROR" "sshd二进制文件检查失败"
    startup_diag=$(collect_startup_diagnostics)
    startup_diag=${startup_diag##*$'\n'}
    log_ssh_error "ERROR" "启动诊断信息: $startup_diag"
    send_notification "SSH二进制文件损坏" "sshd二进制文件检查失败。诊断: $startup_diag"
  fi

  # 10. Start sshd, keeping its stderr for the error log.
  log_ssh_error "INFO" "执行命令: $sshd_path"
  sshd_output=$(mktemp)
  if ! "$sshd_path" 2> "$sshd_output"; then
    log_ssh_error "ERROR" "SSH服务启动失败,详细错误信息:"
    while read -r err_line; do
      log_ssh_error "ERROR" " $err_line"
    done < "$sshd_output"

    log_ssh_error "ERROR" "SSH守护进程调试信息:"
    sshd_debug=$(capture_sshd_errors)
    sshd_debug=${sshd_debug##*$'\n'}
    log_ssh_error "ERROR" "调试信息已保存到: $sshd_debug"

    log_ssh_error "ERROR" "系统日志信息:"
    system_logs=$(collect_ssh_system_logs)
    system_logs=${system_logs##*$'\n'}
    log_ssh_error "ERROR" "系统日志已保存到: $system_logs"

    send_notification "SSH强制启动失败" "SSH服务强制启动失败,详细错误信息已记录到: $SSH_ERROR_LOG"
    rm -f "$sshd_output"
  else
    log_ssh_error "INFO" "SSH服务启动成功(无错误输出)"
    rm -f "$sshd_output"
  fi

  sleep 5

  # 11. Verify the outcome.
  if check_ssh_process && check_ssh_port; then
    log_message "INFO" "强制恢复成功"
    log_ssh_error "INFO" "强制恢复成功,SSH服务已正常运行"
    reset_backoff

    if [ -f "$SSH_ERROR_LOG" ]; then
      log_message "INFO" "最近的SSH错误日志:"
      show_recent_ssh_errors 10
    fi

    send_notification "SSH强制恢复成功" "SSH服务已通过强制恢复策略成功恢复"

    # Record the digest of the configuration that is now in place.
    save_config_checksum
    return 0
  fi

  log_message "ERROR" "强制恢复失败,应用指数退避"
  log_ssh_error "ERROR" "强制恢复失败,应用指数退避"
  apply_backoff

  startup_diag=$(collect_startup_diagnostics)
  startup_diag=${startup_diag##*$'\n'}
  log_ssh_error "ERROR" "启动诊断信息: $startup_diag"

  system_logs=$(collect_ssh_system_logs)
  system_logs=${system_logs##*$'\n'}
  log_ssh_error "ERROR" "系统日志已保存到: $system_logs"

  send_notification "SSH强制恢复失败" "SSH服务强制恢复失败,详细错误信息已记录到: $SSH_ERROR_LOG"
  return 1
}
|
|
| |
#######################################
# Remove old report files, keeping the five newest of each family.
# All report families this script writes are covered (ssh_diagnostics_,
# ssh_startup_diagnostics_, ssh_system_logs_, ssh_config_test_,
# sshd_error_output_), so none of them accumulates without bound.
# Arguments: $1 - directory holding the reports (default /tmp)
#######################################
cleanup_old_diagnostics() {
  local dir=${1:-/tmp}
  local keep=5
  local prefix
  local stale
  local excess
  local removed_any=0
  local -a reports

  log_message "DEBUG" "清理旧的诊断文件..."

  for prefix in ssh_diagnostics_ ssh_startup_diagnostics_ ssh_system_logs_ \
    ssh_config_test_ sshd_error_output_; do
    # File names embed a YYYYmmdd_HHMMSS stamp, so the sorted glob
    # expansion runs from oldest to newest; "ls" output need not be parsed.
    reports=("$dir/$prefix"*.log)
    # An unmatched glob stays literal; skip the family in that case.
    [ -e "${reports[0]}" ] || continue

    excess=$(( ${#reports[@]} - keep ))
    (( excess > 0 )) || continue

    for stale in "${reports[@]:0:excess}"; do
      rm -f -- "$stale" 2>/dev/null && removed_any=1
    done
  done

  if (( removed_any )); then
    log_message "DEBUG" "已清理旧的诊断文件"
  fi
}
|
|
| |
#######################################
# Signal handler: report the shutdown, start sshd once more if it is not
# running, then exit with status 0.
# Globals: LOG_FILE (read)
#######################################
cleanup() {
  local candidate

  log_message "INFO" "接收到终止信号,SSH服务看门狗开始清理..."

  report_to_monitor_api "watchdog_stopping" "SSH watchdog is stopping"

  if ! check_ssh_process; then
    log_message "WARN" "退出前检测到SSH服务未运行,尝试最后一次启动..."
    # Use the first executable sshd found; only one start is attempted.
    for candidate in /usr/sbin/sshd /usr/bin/sshd; do
      if [ -x "$candidate" ]; then
        "$candidate" 2>> "$LOG_FILE"
        break
      fi
    done
  fi

  log_message "INFO" "SSH服务看门狗已退出"
  exit 0
}
|
|
| |
#######################################
# Print the command-line help text to stdout.
#######################################
show_usage() {
  # One argument per output line; "$0" is expanded to the script name.
  printf '%s\n' \
    "SSH服务看门狗脚本 v4.0" \
    "" \
    "用法: $0 [选项]" \
    "" \
    "选项:" \
    "--help, -h 显示此帮助信息" \
    "--show-errors [行数] 显示最近的SSH错误日志(默认50行)" \
    "--show-log [行数] 显示看门狗主日志(默认50行)" \
    "--test-config 详细测试SSH配置文件" \
    "--capture-errors 捕获SSH守护进程的详细错误" \
    "--collect-logs 收集SSH相关的系统日志" \
    "--check-binary 检查SSH二进制文件完整性" \
    "--verify-config 验证SSH配置完整性" \
    "--status 显示SSH服务当前状态" \
    "" \
    "示例:" \
    "$0 --show-errors 100 # 显示最近100条SSH错误日志" \
    "$0 --test-config # 详细测试SSH配置文件" \
    "$0 --status # 显示SSH服务状态" \
    "" \
    "不带参数运行时,脚本将启动看门狗监控循环。"
}
|
|
| |
# Command-line dispatch: when an option is given, run the matching helper
# and exit instead of starting the monitoring loop.
if [ $# -gt 0 ]; then
  case "$1" in
    --help|-h)
      show_usage
      exit 0
      ;;
    --show-errors|--show-log)
      lines=${2:-50}
      # The count is handed to "tail -n"; reject anything that is not a
      # plain non-negative integer instead of letting tail fail.
      case "$lines" in
        ''|*[!0-9]*)
          echo "无效的行数: $lines" >&2
          show_usage >&2
          exit 1
          ;;
      esac
      if [ "$1" = "--show-errors" ]; then
        show_recent_ssh_errors "$lines"
      else
        echo "=== 看门狗主日志(最后 $lines 行)==="
        if [ -f "$LOG_FILE" ]; then
          tail -n "$lines" "$LOG_FILE"
        else
          echo "日志文件不存在: $LOG_FILE"
        fi
      fi
      exit 0
      ;;
    --test-config)
      echo "=== 详细测试SSH配置文件 ==="
      test_ssh_config_detailed
      exit 0
      ;;
    --capture-errors)
      echo "=== 捕获SSH守护进程错误 ==="
      capture_sshd_errors
      exit 0
      ;;
    --collect-logs)
      echo "=== 收集SSH系统日志 ==="
      collect_ssh_system_logs
      exit 0
      ;;
    --check-binary)
      echo "=== 检查SSH二进制文件 ==="
      if check_sshd_binary; then
        echo "✓ SSH二进制文件检查通过"
      else
        echo "✗ SSH二进制文件检查失败"
        exit 1
      fi
      exit 0
      ;;
    --verify-config)
      echo "=== 验证SSH配置完整性 ==="
      if verify_config_integrity; then
        echo "✓ SSH配置完整性验证通过"
      else
        echo "✗ SSH配置完整性验证失败"
        exit 1
      fi
      exit 0
      ;;
    --status)
      echo "=== SSH服务状态 ==="
      echo -n "SSH进程: "
      if check_ssh_process; then
        echo "运行中 (PIDs: $(pgrep -x sshd | tr '\n' ' '))"
      else
        echo "未运行"
      fi
      echo -n "SSH端口 $SSH_PORT: "
      if check_ssh_port; then
        echo "监听中"
      else
        echo "未监听"
      fi
      echo -n "SSH响应: "
      if check_ssh_response; then
        echo "正常"
      else
        echo "无响应"
      fi
      echo ""
      echo "=== 最近的错误日志(最后10行)==="
      if [ -f "$SSH_ERROR_LOG" ]; then
        tail -10 "$SSH_ERROR_LOG"
      else
        echo "无错误日志"
      fi
      exit 0
      ;;
    *)
      echo "未知选项: $1"
      show_usage
      exit 1
      ;;
  esac
fi
|
|
| |
# ---- Start-up ---------------------------------------------------------------

# Run the cleanup handler on termination, interrupt and hang-up.
trap cleanup SIGTERM SIGINT SIGHUP

# The monitoring loop requires root.
if (( EUID != 0 )); then
  echo "ERROR: 此脚本需要以root权限运行" >&2
  exit 1
fi

# Make sure both log files and their directories exist.
mkdir -p "$(dirname "$LOG_FILE")" "$(dirname "$SSH_ERROR_LOG")"
touch "$LOG_FILE" "$SSH_ERROR_LOG"

# Announce the start to the monitoring API.
report_to_monitor_api "watchdog_started" "SSH watchdog started with check interval ${CHECK_INTERVAL}s"

# Banner, then hand over to the monitoring loop.
log_message "INFO" "=========================================="
log_message "INFO" "SSH服务看门狗即将启动主监控循环"
log_message "INFO" "=========================================="
log_message "INFO" "使用 '$0 --help' 查看可用命令"
log_message "INFO" "使用 '$0 --show-errors' 查看SSH错误日志"
main_loop