action / scripts /check_ssh_health.sh
GGSheng's picture
feat: deploy Gemma 4 to hf space
020c337 verified
#!/bin/bash
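# SSH service health check script
# Used for Docker container health checks and monitoring
# Version: 2.0 - enhanced edition with JSON output, detailed diagnostics, and API integration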
# Configuration
# These settings keep any non-empty value already present in the environment
# and otherwise fall back to the defaults below.
: "${SSH_PORT:=22}"
: "${TIMEOUT:=5}"
: "${OUTPUT_FORMAT:=text}" # text or json
: "${DETAILED:=false}"
: "${MONITOR_API_URL:=}"
# Every check starts enabled; the --no-* command-line flags switch them off.
CHECK_PROCESS=true
CHECK_PORT=true
CHECK_RESPONSE=true
# Help text
# Print the usage summary, the supported options and the exit codes to stdout.
show_help() {
  local -a help_lines=(
    "Usage: $0 [OPTIONS]"
    "SSH Service Health Check Script"
    ""
    "Options:"
    " -p, --port PORT SSH port (default: 22)"
    " -t, --timeout SEC Connection timeout (default: 5)"
    " --no-process Skip process check"
    " --no-port Skip port check"
    " --no-response Skip response check"
    " -f, --format FMT Output format: text, json (default: text)"
    " -d, --detailed Show detailed diagnostic information"
    " -a, --api-url URL Report results to monitoring API"
    " -h, --help Show this help"
    ""
    "Exit codes:"
    " 0 Healthy"
    " 1 Unhealthy"
    " 2 Degraded"
  )
  # One line per array element, exactly as a sequence of echo calls would print.
  printf '%s\n' "${help_lines[@]}"
}
# Parse command-line arguments

# Abort with exit code 1 when an option that takes a value is the last
# argument. Without this guard `shift 2` fails, leaves the positional
# parameters untouched, and the parsing loop below never terminates.
# Arguments: $1 - option name as typed, $2 - number of arguments still unparsed
# Outputs:   an error message on stderr when the value is missing
require_option_value() {
  if (( $2 < 2 )); then
    echo "Option $1 requires an argument" >&2
    exit 1
  fi
}

# Command-line options override the configuration values set earlier.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -p | --port)
      require_option_value "$1" "$#"
      SSH_PORT="$2"
      shift 2
      ;;
    -t | --timeout)
      require_option_value "$1" "$#"
      TIMEOUT="$2"
      shift 2
      ;;
    --no-process)
      CHECK_PROCESS=false
      shift
      ;;
    --no-port)
      CHECK_PORT=false
      shift
      ;;
    --no-response)
      CHECK_RESPONSE=false
      shift
      ;;
    -f | --format)
      require_option_value "$1" "$#"
      OUTPUT_FORMAT="$2"
      shift 2
      ;;
    -d | --detailed)
      DETAILED=true
      shift
      ;;
    -a | --api-url)
      require_option_value "$1" "$#"
      MONITOR_API_URL="$2"
      shift 2
      ;;
    -h | --help)
      show_help
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      show_help
      exit 1
      ;;
  esac
done
# Collect detailed diagnostics
# Print a multi-section diagnostic report to stdout: system information, sshd
# processes, listeners on SSH_PORT, the sshd configuration test, memory/disk/
# load figures and the most recent SSH log lines.
# Globals: SSH_PORT (read)
collect_detailed_diagnostics() {
  local listen_pattern=":$SSH_PORT "
  local socket_tool=""
  local log_file

  printf '\n%s\n\n' "=== Detailed Diagnostics ==="

  printf '%s\n' "--- System Information ---"
  uname -a

  printf '\n%s\n' "--- SSH Processes ---"
  ps aux | grep sshd | grep -v grep || echo "No SSH processes found"

  printf '\n%s\n' "--- Network Connections (SSH port $SSH_PORT) ---"
  # netstat is preferred; ss is only used when netstat is unavailable.
  if command -v netstat >/dev/null 2>&1; then
    socket_tool=netstat
  elif command -v ss >/dev/null 2>&1; then
    socket_tool=ss
  fi
  if [[ -n "$socket_tool" ]]; then
    "$socket_tool" -tulnp 2>/dev/null | grep "$listen_pattern" \
      || echo "Port $SSH_PORT not listening"
  fi

  printf '\n%s\n' "--- SSH Configuration Test ---"
  if [[ ! -f "/etc/ssh/sshd_config" ]]; then
    echo "SSH configuration file not found"
  else
    sshd -t 2>&1 || echo "SSH configuration has errors"
  fi

  printf '\n%s\n%s\n' "--- System Resources ---" "Memory Usage:"
  free -h
  printf '\n%s\n' "Disk Usage:"
  df -h
  printf '\n%s\n' "System Load:"
  uptime

  printf '\n%s\n' "--- Recent SSH Logs ---"
  if command -v journalctl >/dev/null 2>&1; then
    journalctl -u ssh --no-pager -n 10 2>/dev/null \
      || echo "Cannot retrieve journalctl logs"
  else
    # Without journalctl, show the first classic auth log that exists.
    for log_file in /var/log/auth.log /var/log/secure; do
      if [[ -f "$log_file" ]]; then
        tail -10 "$log_file" 2>/dev/null || echo "Cannot read $log_file"
        break
      fi
    done
  fi
  printf '\n'
}
# Health check function
# Run the enabled checks (sshd process, listening port, TCP response), print a
# report and, when MONITOR_API_URL is set, forward the result via report_to_api.
#
# Globals (read): SSH_PORT, TIMEOUT, CHECK_PROCESS, CHECK_PORT, CHECK_RESPONSE,
#                 OUTPUT_FORMAT, DETAILED, MONITOR_API_URL
# Outputs: a JSON document when OUTPUT_FORMAT is "json"; otherwise a text
#          report, followed by collect_detailed_diagnostics output when
#          DETAILED is true.
# Returns: 0 healthy; 1 unhealthy (process or port check failed);
#          2 degraded (only the response check failed).
check_ssh_health() {
  local errors=0
  local warnings=0
  local sshd_pids=""
  local port_listening=false
  local ssh_responding=false
  local start_time end_time duration
  local health_status="unknown"
  local exit_code=0
  # One JSON object per executed check; joined with commas before printing so
  # that the "checks" array is valid JSON however many checks ran.
  local -a check_results=()
  local checks_json=""
  local entry

  start_time=$(date +%s)

  # 1. Check the sshd process (single pgrep call: status and PIDs together).
  if [[ "$CHECK_PROCESS" == true ]]; then
    if sshd_pids=$(pgrep -x "sshd"); then
      sshd_pids=${sshd_pids//$'\n'/ }
      check_results+=("{\"check\":\"process\",\"status\":\"ok\",\"pids\":\"$sshd_pids\"}")
    else
      sshd_pids=""
      check_results+=("{\"check\":\"process\",\"status\":\"failed\",\"message\":\"SSH process not found\"}")
      errors=$((errors + 1))
    fi
  fi

  # 2. Check that something listens on the SSH port (netstat first, then ss).
  if [[ "$CHECK_PORT" == true ]]; then
    if netstat -tuln 2>/dev/null | grep -q ":$SSH_PORT " \
      || ss -tuln 2>/dev/null | grep -q ":$SSH_PORT "; then
      port_listening=true
      check_results+=("{\"check\":\"port\",\"status\":\"ok\",\"port\":$SSH_PORT}")
    else
      check_results+=("{\"check\":\"port\",\"status\":\"failed\",\"port\":$SSH_PORT,\"message\":\"Port not listening\"}")
      errors=$((errors + 1))
    fi
  fi

  # 3. Check that a TCP connection to the port can be opened within TIMEOUT.
  if [[ "$CHECK_RESPONSE" == true ]]; then
    if timeout "$TIMEOUT" bash -c "</dev/tcp/localhost/$SSH_PORT" 2>/dev/null; then
      ssh_responding=true
      # NOTE: response_time carries the configured timeout, not a measured latency.
      check_results+=("{\"check\":\"response\",\"status\":\"ok\",\"response_time\":\"${TIMEOUT}s\"}")
    else
      check_results+=("{\"check\":\"response\",\"status\":\"warning\",\"message\":\"SSH not responding within ${TIMEOUT}s\"}")
      warnings=$((warnings + 1))
    fi
  fi

  # Elapsed time of the checks, in whole seconds.
  end_time=$(date +%s)
  duration=$((end_time - start_time))

  # Errors outrank warnings when deciding the final status.
  if ((errors > 0)); then
    health_status="unhealthy"
    exit_code=1
  elif ((warnings > 0)); then
    health_status="degraded"
    exit_code=2
  else
    health_status="healthy"
    exit_code=0
  fi

  if [[ "$OUTPUT_FORMAT" == "json" ]]; then
    # Comma-join the per-check objects.
    for entry in "${check_results[@]}"; do
      checks_json+="${checks_json:+,}${entry}"
    done
    cat <<EOF
{
"timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
"ssh_port": $SSH_PORT,
"timeout": $TIMEOUT,
"health_status": "$health_status",
"errors": $errors,
"warnings": $warnings,
"check_duration_seconds": $duration,
"sshd_pids": "$sshd_pids",
"port_listening": $port_listening,
"ssh_responding": $ssh_responding,
"checks": [$checks_json]
}
EOF
  else
    echo "=== SSH Service Health Check ==="
    echo "Time: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "SSH Port: $SSH_PORT"
    echo "Timeout: ${TIMEOUT}s"
    echo "Duration: ${duration}s"
    echo ""

    if [[ "$CHECK_PROCESS" == true ]]; then
      echo -n "[1/3] Checking SSH process... "
      if [[ -n "$sshd_pids" ]]; then
        echo "OK (PIDs: $sshd_pids)"
      else
        echo "FAILED - SSH process not found"
      fi
    fi

    if [[ "$CHECK_PORT" == true ]]; then
      echo -n "[2/3] Checking SSH port $SSH_PORT... "
      if [[ "$port_listening" == true ]]; then
        echo "OK - Port is listening"
      else
        echo "FAILED - Port $SSH_PORT not listening"
      fi
    fi

    if [[ "$CHECK_RESPONSE" == true ]]; then
      echo -n "[3/3] Checking SSH response... "
      if [[ "$ssh_responding" == true ]]; then
        echo "OK - SSH is responding"
      else
        echo "WARNING - SSH port open but not responding (may be starting up)"
      fi
    fi

    echo ""
    echo "=== Health Check Result ==="
    echo "Status: $(echo "$health_status" | tr '[:lower:]' '[:upper:]')"
    echo "Errors: $errors"
    echo "Warnings: $warnings"

    # Detailed diagnostics are only appended to the text report.
    if [[ "$DETAILED" == true ]]; then
      collect_detailed_diagnostics
    fi
  fi

  # Forward the result to the monitoring API when one is configured.
  if [[ -n "$MONITOR_API_URL" ]]; then
    report_to_api "$health_status" "$errors" "$warnings"
  fi

  return "$exit_code"
}
# Report the result to the monitoring API
# POST a small JSON status document to MONITOR_API_URL with curl.
# Globals:   MONITOR_API_URL (read)
#            MONITOR_API_TIMEOUT (read, optional) - seconds allowed for the
#            whole request, default 5
# Arguments: $1 - health status string, $2 - error count, $3 - warning count
# Outputs:   nothing; curl's output is discarded
# Returns:   always 0 - reporting is best-effort and must not change the
#            result of the health check
report_to_api() {
  local status=$1
  local errors=$2
  local warnings=$3
  local payload
  payload=$(cat <<EOF
{
"service": "ssh",
"status": "$status",
"errors": $errors,
"warnings": $warnings,
"timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
"source": "check_ssh_health.sh"
}
EOF
)
  # --max-time bounds the request so an unresponsive endpoint cannot stall the
  # health check; failures are deliberately ignored (best-effort reporting).
  curl -s --max-time "${MONITOR_API_TIMEOUT:-5}" -X POST "$MONITOR_API_URL" \
    -H 'Content-Type: application/json' \
    -d "$payload" >/dev/null 2>&1 || true
}
# Run the health check
# The function's return status (0 healthy, 1 unhealthy, 2 degraded) becomes
# the exit code of the script.
health_rc=0
check_ssh_health || health_rc=$?
exit "$health_rc"