#!/bin/bash
#
# SSH service health check script.
# Intended for Docker container health checks and monitoring.
# Version: 2.0 - enhanced: JSON output, detailed diagnostics, API integration.
#
# Environment overrides: SSH_PORT, TIMEOUT, OUTPUT_FORMAT, DETAILED,
# MONITOR_API_URL (command-line options take precedence).
#
# Exit codes: 0 healthy, 1 unhealthy (or usage error), 2 degraded.
#
# NOTE: `set -o pipefail` is deliberately NOT enabled. The port checks use
# `producer | grep -q`, and grep -q exits on the first match, which can kill
# the producer with SIGPIPE; under pipefail that would turn a match into a
# failure.

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
SSH_PORT=${SSH_PORT:-22}
TIMEOUT=${TIMEOUT:-5}
CHECK_PROCESS=true
CHECK_PORT=true
CHECK_RESPONSE=true
OUTPUT_FORMAT=${OUTPUT_FORMAT:-"text"} # text or json
DETAILED=${DETAILED:-false}
MONITOR_API_URL=${MONITOR_API_URL:-""}

#######################################
# Print usage information to stdout.
#######################################
show_help() {
  cat <<EOF
Usage: $0 [OPTIONS]
SSH Service Health Check Script

Options:
  -p, --port PORT      SSH port (default: 22)
  -t, --timeout SEC    Connection timeout (default: 5)
  --no-process         Skip process check
  --no-port            Skip port check
  --no-response        Skip response check
  -f, --format FMT     Output format: text, json (default: text)
  -d, --detailed       Show detailed diagnostic information
  -a, --api-url URL    Report results to monitoring API
  -h, --help           Show this help

Exit codes:
  0    Healthy
  1    Unhealthy
  2    Degraded
EOF
}

#######################################
# Abort with a usage error unless the current option is followed by a value.
# Without this guard `shift 2` fails when the option is the last argument,
# nothing is shifted, and the parsing loop never terminates.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
#######################################
require_value() {
  if (($2 < 2)); then
    echo "Option $1 requires a value" >&2
    show_help
    exit 1
  fi
}

#######################################
# Parse command-line options into the global configuration variables.
# Arguments: the script's positional parameters
#######################################
parse_args() {
  while (($# > 0)); do
    case "$1" in
      -p | --port)
        require_value "$1" "$#"
        SSH_PORT=$2
        shift 2
        ;;
      -t | --timeout)
        require_value "$1" "$#"
        TIMEOUT=$2
        shift 2
        ;;
      --no-process)
        CHECK_PROCESS=false
        shift
        ;;
      --no-port)
        CHECK_PORT=false
        shift
        ;;
      --no-response)
        CHECK_RESPONSE=false
        shift
        ;;
      -f | --format)
        require_value "$1" "$#"
        OUTPUT_FORMAT=$2
        shift 2
        ;;
      -d | --detailed)
        DETAILED=true
        shift
        ;;
      -a | --api-url)
        require_value "$1" "$#"
        MONITOR_API_URL=$2
        shift 2
        ;;
      -h | --help)
        show_help
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        show_help
        exit 1
        ;;
    esac
  done
}

#######################################
# Reject values that would produce broken JSON or an unsafe probe command.
# SSH_PORT is emitted unquoted in JSON and used in a /dev/tcp path, so it
# must be a plain TCP port number; TIMEOUT must be a non-negative number.
#######################################
validate_config() {
  if [[ ! "$SSH_PORT" =~ ^[0-9]{1,5}$ ]] \
    || ((10#$SSH_PORT < 1 || 10#$SSH_PORT > 65535)); then
    echo "Invalid port: $SSH_PORT (expected 1-65535)" >&2
    exit 1
  fi
  SSH_PORT=$((10#$SSH_PORT)) # strip leading zeros (invalid in JSON numbers)

  if [[ ! "$TIMEOUT" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    echo "Invalid timeout: $TIMEOUT (expected a number of seconds)" >&2
    exit 1
  fi
}

#######################################
# Print the current time in milliseconds.
# Uses EPOCHREALTIME (bash >= 5.0) when available; otherwise falls back to
# whole seconds from date(1), so the resolution is then one second.
#######################################
now_ms() {
  if [[ -n "${EPOCHREALTIME:-}" ]]; then
    # Format is "seconds<sep>microseconds"; the separator depends on locale.
    local sec=${EPOCHREALTIME%[.,]*}
    local frac=${EPOCHREALTIME#*[.,]}
    printf '%s\n' "$((sec * 1000 + 10#${frac:0:3}))"
  else
    printf '%s\n' "$(($(date +%s) * 1000))"
  fi
}

#######################################
# Succeed if something is listening on TCP port SSH_PORT.
# Tries netstat first, then ss, because minimal images often ship only one.
#######################################
port_is_listening() {
  local pattern=":${SSH_PORT}[[:space:]]"
  if netstat -tln 2>/dev/null | grep -Eq "$pattern"; then
    return 0
  fi
  ss -tln 2>/dev/null | grep -Eq "$pattern"
}

#######################################
# Print a human-readable diagnostic dump (system, processes, sockets,
# sshd configuration test, resources, recent logs) to stdout.
#######################################
collect_detailed_diagnostics() {
  echo ""
  echo "=== Detailed Diagnostics ==="
  echo ""

  echo "--- System Information ---"
  uname -a
  echo ""

  echo "--- SSH Processes ---"
  # The [s] bracket keeps grep from matching its own command line.
  ps aux | grep '[s]shd' || echo "No SSH processes found"
  echo ""

  echo "--- Network Connections (SSH port $SSH_PORT) ---"
  if command -v netstat &>/dev/null; then
    netstat -tulnp 2>/dev/null | grep ":$SSH_PORT " \
      || echo "Port $SSH_PORT not listening"
  elif command -v ss &>/dev/null; then
    ss -tulnp 2>/dev/null | grep ":$SSH_PORT " \
      || echo "Port $SSH_PORT not listening"
  else
    echo "Neither netstat nor ss is available"
  fi
  echo ""

  echo "--- SSH Configuration Test ---"
  if [[ ! -f "/etc/ssh/sshd_config" ]]; then
    echo "SSH configuration file not found"
  elif ! command -v sshd &>/dev/null; then
    echo "sshd binary not found in PATH"
  else
    sshd -t 2>&1 || echo "SSH configuration has errors"
  fi
  echo ""

  echo "--- System Resources ---"
  echo "Memory Usage:"
  free -h
  echo ""
  echo "Disk Usage:"
  df -h
  echo ""
  echo "System Load:"
  uptime
  echo ""

  echo "--- Recent SSH Logs ---"
  if command -v journalctl &>/dev/null; then
    # The unit is called "ssh" on Debian-family and "sshd" on RHEL-family.
    journalctl -u ssh -u sshd --no-pager -n 10 2>/dev/null \
      || echo "Cannot retrieve journalctl logs"
  elif [[ -f "/var/log/auth.log" ]]; then
    tail -10 /var/log/auth.log 2>/dev/null \
      || echo "Cannot read /var/log/auth.log"
  elif [[ -f "/var/log/secure" ]]; then
    tail -10 /var/log/secure 2>/dev/null \
      || echo "Cannot read /var/log/secure"
  fi
  echo ""
}

#######################################
# Assemble the JSON report.
# NOTE(review): the original JSON template was lost from the source; the
# top-level key names below are a reconstruction - confirm them against any
# existing consumer before deploying.
# Arguments: $1 status, $2 exit code, $3 error count, $4 warning count,
#            $5 duration in seconds, $6.. one JSON object per check
# Outputs:   the JSON document on stdout
#######################################
build_json_report() {
  local status=$1 code=$2 error_count=$3 warning_count=$4 duration=$5
  shift 5
  # Join the per-check objects with commas so "checks" is a valid JSON array.
  local IFS=,
  local joined="$*"
  cat <<EOF
{
  "status": "$status",
  "exit_code": $code,
  "port": $SSH_PORT,
  "errors": $error_count,
  "warnings": $warning_count,
  "duration_seconds": $duration,
  "checks": [$joined]
}
EOF
}

#######################################
# Best-effort POST of the JSON report to MONITOR_API_URL.
# Failures are ignored on purpose: an unreachable monitoring endpoint must
# not change the health verdict.
# NOTE(review): the original curl invocation was lost from the source (only
# its ">/dev/null 2>&1 || true" tail survived); method and headers here are
# a reconstruction.
# Arguments: $1 - JSON payload
#######################################
report_to_api() {
  local payload=$1
  [[ -n "$MONITOR_API_URL" ]] || return 0
  command -v curl >/dev/null 2>&1 || return 0
  curl -s -X POST \
    -H "Content-Type: application/json" \
    --max-time "$TIMEOUT" \
    -d "$payload" \
    "$MONITOR_API_URL" >/dev/null 2>&1 || true
}

#######################################
# Run the enabled checks, print the report and optionally push it to the
# monitoring API.
# Globals:  reads SSH_PORT, TIMEOUT, CHECK_*, OUTPUT_FORMAT, DETAILED,
#           MONITOR_API_URL
# Outputs:  text or JSON report on stdout
# Returns:  0 healthy, 1 unhealthy (a hard check failed),
#           2 degraded (only the response probe failed)
#######################################
check_ssh_health() {
  local errors=0 warnings=0
  local sshd_pids=""
  local -a checks_json=() checks_text=()
  local start_time end_time duration
  local probe_start probe_ms response_time
  start_time=$(date +%s)

  # 1. SSH daemon process
  if [[ "$CHECK_PROCESS" == true ]]; then
    if sshd_pids=$(pgrep -x "sshd"); then
      sshd_pids=${sshd_pids//$'\n'/ }
      checks_json+=("{\"check\":\"process\",\"status\":\"ok\",\"pids\":\"$sshd_pids\"}")
      checks_text+=("[OK] SSH process running (PIDs: $sshd_pids)")
    else
      checks_json+=("{\"check\":\"process\",\"status\":\"failed\",\"message\":\"SSH process not found\"}")
      checks_text+=("[FAILED] SSH process not found")
      errors=$((errors + 1))
    fi
  fi

  # 2. Listening socket
  if [[ "$CHECK_PORT" == true ]]; then
    if port_is_listening; then
      checks_json+=("{\"check\":\"port\",\"status\":\"ok\",\"port\":$SSH_PORT}")
      checks_text+=("[OK] Port $SSH_PORT is listening")
    else
      checks_json+=("{\"check\":\"port\",\"status\":\"failed\",\"port\":$SSH_PORT,\"message\":\"Port not listening\"}")
      checks_text+=("[FAILED] Port $SSH_PORT not listening")
      errors=$((errors + 1))
    fi
  fi

  # 3. TCP connect probe. A failure here is only a warning (degraded).
  if [[ "$CHECK_RESPONSE" == true ]]; then
    probe_start=$(now_ms)
    # The port is passed as a positional argument rather than interpolated
    # into the command string, so it can never be parsed as shell code.
    # NOTE(review): the probe target was lost from the source; "localhost"
    # is assumed.
    if timeout "$TIMEOUT" bash -c ': </dev/tcp/localhost/"$1"' _ "$SSH_PORT" 2>/dev/null; then
      probe_ms=$(($(now_ms) - probe_start))
      ((probe_ms >= 0)) || probe_ms=0 # guard against a clock step backwards
      printf -v response_time '%d.%03ds' "$((probe_ms / 1000))" "$((probe_ms % 1000))"
      checks_json+=("{\"check\":\"response\",\"status\":\"ok\",\"response_time\":\"$response_time\"}")
      checks_text+=("[OK] SSH responding (${response_time})")
    else
      checks_json+=("{\"check\":\"response\",\"status\":\"warning\",\"message\":\"SSH not responding within ${TIMEOUT}s\"}")
      checks_text+=("[WARNING] SSH not responding within ${TIMEOUT}s")
      warnings=$((warnings + 1))
    fi
  fi

  # Elapsed wall-clock time of the whole check, in seconds.
  end_time=$(date +%s)
  duration=$((end_time - start_time))

  # Final verdict: any error wins over warnings.
  local health_status exit_code
  if ((errors > 0)); then
    health_status="unhealthy"
    exit_code=1
  elif ((warnings > 0)); then
    health_status="degraded"
    exit_code=2
  else
    health_status="healthy"
    exit_code=0
  fi

  local report_json
  report_json=$(build_json_report "$health_status" "$exit_code" \
    "$errors" "$warnings" "$duration" "${checks_json[@]}")

  # Output. NOTE(review): the original text layout was lost from the source;
  # this layout is a reconstruction.
  if [[ "$OUTPUT_FORMAT" == "json" ]]; then
    printf '%s\n' "$report_json"
    if [[ "$DETAILED" == true ]]; then
      # Keep stdout a single valid JSON document.
      collect_detailed_diagnostics >&2
    fi
  else
    echo "SSH Health Check: $health_status (port $SSH_PORT, ${duration}s)"
    local line
    for line in "${checks_text[@]}"; do
      echo "  $line"
    done
    if [[ "$DETAILED" == true ]]; then
      collect_detailed_diagnostics
    fi
  fi

  report_to_api "$report_json"

  return "$exit_code"
}

# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
parse_args "$@"
validate_config

# Run the health check and propagate its verdict as the exit status.
check_ssh_health
exit $?