elonmusk / scripts /ssh_service_watchdog.sh
GGSheng's picture
feat: deploy Gemma 4 to hf space
3b47d98 verified
#!/bin/bash
# SSH service watchdog script - keeps the SSH service continuously available.
# Purpose: monitor the SSH service, recover it automatically on failure, and write detailed logs.
# Version: 4.0 - enhanced: detailed error logging, config integrity check, restart rate limiting, exponential backoff
# Configuration parameters (those written as ${VAR:-default} can be overridden via environment variables)
SSH_PORT=${SSH_PORT:-22}
CHECK_INTERVAL=${CHECK_INTERVAL:-30}
MAX_RETRIES=${MAX_RETRIES:-3}
LOG_FILE="/var/log/ssh_watchdog.log"
SSH_ERROR_LOG="/var/log/ssh_error.log"
MAX_LOG_SIZE=10485760 # 10MB
SSH_CONFIG="/etc/ssh/sshd_config"
SSH_BACKUP_CONFIG="/etc/ssh/sshd_config.backup"
SSH_CONFIG_CHECKSUM="/etc/ssh/sshd_config.checksum"
NOTIFICATION_ENABLED=${NOTIFICATION_ENABLED:-false}
NOTIFICATION_WEBHOOK=${NOTIFICATION_WEBHOOK:-}
MONITOR_API_URL=${MONITOR_API_URL:-"http://localhost:7680/api/terminal"}
# Restart rate limiting: restart_count is the number of restarts counted in
# the hour identified by last_restart_hour (format YYYYMMDDHH).
MAX_RESTARTS_PER_HOUR=${MAX_RESTARTS_PER_HOUR:-10}
restart_count=0
last_restart_hour=$(date +%Y%m%d%H)
# Exponential backoff configuration
BACKOFF_BASE=${BACKOFF_BASE:-2}
BACKOFF_MAX=${BACKOFF_MAX:-300} # maximum backoff time (seconds)
# Current backoff exponent; 1 means "no failed recovery pending".
current_backoff=1
# Write a message to the dedicated SSH error log and mirror it to the main log.
# Globals:   SSH_ERROR_LOG (read)
# Arguments: $1 - log level, $2 - message text
# Outputs:   appends "[timestamp] [level] message" to $SSH_ERROR_LOG, then
#            forwards level and message to log_message
log_ssh_error() {
  local severity=$1
  local text=$2
  local target="$SSH_ERROR_LOG"
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')

  # The error log (and its directory) may not exist yet.
  mkdir -p "$(dirname "$target")"
  touch "$target"

  printf '[%s] [%s] %s\n' "$stamp" "$severity" "$text" >> "$target"

  # Mirror the entry to the main watchdog log.
  log_message "$severity" "$text"
}
# Color definitions (ANSI escape sequences used by log_message on a terminal)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Make sure the log directory and the log file exist
mkdir -p "$(dirname "$LOG_FILE")"
touch "$LOG_FILE"
# ========== Added: configuration integrity check ==========
# Print the SHA-256 digest of the SSH config file.
# Globals: SSH_CONFIG (read)
# Outputs: the hex digest on stdout; nothing when sha256sum produces no output
#          (its error messages are discarded)
calculate_config_checksum() {
  sha256sum "$SSH_CONFIG" 2>/dev/null \
    | cut -d ' ' -f 1
}
# Persist the current config checksum so later runs can detect changes.
# Globals: SSH_CONFIG_CHECKSUM (read; file is overwritten)
# Does nothing when calculate_config_checksum yields an empty string.
save_config_checksum() {
  local digest
  digest=$(calculate_config_checksum)

  # No digest available: leave any existing checksum file alone.
  [ -n "$digest" ] || return 0

  echo "$digest" > "$SSH_CONFIG_CHECKSUM"
  log_message "DEBUG" "配置文件校验和已保存: $digest"
}
# Compare the current config checksum with the stored one.
# Globals: SSH_CONFIG_CHECKSUM (read)
# Returns: 0 when the checksums match, or when no stored checksum exists yet
#          (in that case one is saved); 1 on mismatch.
verify_config_integrity() {
  local actual expected

  # First run: nothing to compare against, so record a baseline and pass.
  if [ ! -f "$SSH_CONFIG_CHECKSUM" ]; then
    log_message "WARN" "配置文件校验和不存在,跳过完整性检查"
    save_config_checksum
    return 0
  fi

  actual=$(calculate_config_checksum)
  expected=$(cat "$SSH_CONFIG_CHECKSUM" 2>/dev/null)

  if [ "$actual" = "$expected" ]; then
    return 0
  fi

  log_message "ERROR" "配置文件可能被篡改!校验和不匹配"
  log_message "ERROR" "期望: $expected"
  log_message "ERROR" "当前: $actual"
  return 1
}
# ========== Added: restart rate limiting ==========
# Account for one restart attempt against the hourly budget.
# Globals: restart_count, last_restart_hour (read and written),
#          MAX_RESTARTS_PER_HOUR (read)
# Returns: 0 when the restart is allowed (the counter is incremented);
#          1 when the budget for the current hour is already used up.
check_restart_rate_limit() {
  local hour_now
  hour_now=$(date +%Y%m%d%H)

  # A new clock hour starts a fresh budget.
  if [ "$hour_now" != "$last_restart_hour" ]; then
    last_restart_hour=$hour_now
    restart_count=0
  fi

  if [ "$restart_count" -ge "$MAX_RESTARTS_PER_HOUR" ]; then
    log_message "ERROR" "重启频率超限!过去1小时内已重启 $restart_count 次(最大 $MAX_RESTARTS_PER_HOUR 次)"
    return 1
  fi

  restart_count=$((restart_count + 1))
  log_message "DEBUG" "重启计数: $restart_count/$MAX_RESTARTS_PER_HOUR (小时: $hour_now)"
  return 0
}
# ========== Added: exponential backoff ==========
# Print the backoff delay in seconds: BACKOFF_BASE raised to the power
# current_backoff, capped at BACKOFF_MAX.
# Globals: BACKOFF_BASE, BACKOFF_MAX, current_backoff (read)
# Outputs: the delay on stdout
#
# The power is built by repeated multiplication and clamped as soon as it
# reaches the cap. A direct `BASE ** exponent` wraps around in 64-bit shell
# arithmetic once the exponent grows (2**63 is negative, 2**64 is 0), which
# would slip past the cap check and collapse the delay.
calculate_backoff() {
  local backoff=1
  local step
  for ((step = 0; step < current_backoff; step++)); do
    backoff=$((backoff * BACKOFF_BASE))
    if [ "$backoff" -ge "$BACKOFF_MAX" ]; then
      backoff=$BACKOFF_MAX
      break
    fi
  done
  echo "$backoff"
}
# Sleep for the current backoff delay, then raise the backoff level by one.
# Globals: current_backoff (read and written)
apply_backoff() {
  local wait_seconds
  wait_seconds=$(calculate_backoff)

  log_message "WARN" "应用指数退避:等待 ${wait_seconds} 秒后重试..."
  sleep "$wait_seconds"

  # The next failure waits longer.
  current_backoff=$((current_backoff + 1))
}
# Reset the backoff level to its initial value (called on success).
# Globals: current_backoff (read and written)
# Logs an INFO line only when a backoff was actually in effect.
reset_backoff() {
  if [ "$current_backoff" -gt 1 ]; then
    log_message "INFO" "恢复成功,重置退避计数器"
  fi
  current_backoff=1
}
# ========== Added: detailed SSH error capture ==========
# Capture detailed sshd diagnostics into a timestamped report under /tmp.
# The report contains the output of `sshd -t`, the first 10 lines of a
# 3-second `sshd -d -D` debug run, and the first 50 lines of `sshd -T`.
# Outputs: the report path, and nothing else, on stdout; progress messages
#          are sent to stderr so that `path=$(capture_sshd_errors)` yields
#          just the path.
capture_sshd_errors() {
  local error_output_file
  error_output_file="/tmp/sshd_error_output_$(date +%Y%m%d_%H%M%S).log"

  log_ssh_error "INFO" "正在捕获SSH守护进程的详细错误信息..." >&2

  # Everything in the group is appended to the report by the redirection on
  # the closing brace, so no additional tee onto the same file is needed
  # (that would write each diagnostic line twice).
  {
    echo "=== SSH守护进程错误捕获 ==="
    echo "捕获时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo ""
    echo "=== 1. SSH配置测试结果 ==="
    sshd -t 2>&1
    echo ""
    echo "=== 2. SSH守护进程调试输出(前10行)==="
    # Run sshd in debug mode for a few seconds only.
    timeout 3 sshd -d -D 2>&1 | head -10
    echo ""
    echo "=== 3. SSH配置文件的语法检查 ==="
    sshd -T 2>&1 | head -50
    echo ""
  } >> "$error_output_file" 2>&1

  log_ssh_error "INFO" "SSH错误详情已保存到: $error_output_file" >&2
  echo "$error_output_file"
}
# Collect SSH-related system information into a timestamped report under /tmp:
# recent SSH log records, sshd processes, listeners on the SSH port, recent
# kernel messages, /etc/ssh permissions and filesystem usage.
# Globals: SSH_PORT (read)
# Outputs: the report path, and nothing else, on stdout; progress messages
#          are sent to stderr so that `path=$(collect_ssh_system_logs)`
#          yields just the path.
collect_ssh_system_logs() {
  local log_file
  log_file="/tmp/ssh_system_logs_$(date +%Y%m%d_%H%M%S).log"

  log_ssh_error "INFO" "正在收集SSH相关的系统日志..." >&2

  {
    echo "=== SSH系统日志收集 ==="
    echo "收集时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo ""
    echo "=== 1. 系统日志中的SSH相关记录(最近50行)==="
    # Prefer the journal; fall back to the classic auth log locations.
    if command -v journalctl &> /dev/null; then
      journalctl -u ssh --no-pager -n 50 2>&1 || echo "无法获取journalctl日志"
    elif [ -f "/var/log/auth.log" ]; then
      tail -50 /var/log/auth.log 2>&1 | grep -i ssh || echo "无SSH相关日志"
    elif [ -f "/var/log/secure" ]; then
      tail -50 /var/log/secure 2>&1 | grep -i ssh || echo "无SSH相关日志"
    else
      echo "未找到SSH日志文件"
    fi
    echo ""
    echo "=== 2. SSH进程当前状态 ==="
    ps aux | grep sshd | grep -v grep || echo "无SSH进程运行"
    echo ""
    echo "=== 3. 网络连接状态(SSH端口)==="
    if command -v netstat &> /dev/null; then
      netstat -tulnp 2>&1 | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 无监听"
    elif command -v ss &> /dev/null; then
      ss -tulnp 2>&1 | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 无监听"
    fi
    echo ""
    echo "=== 4. 最近的系统消息(dmesg)==="
    if command -v dmesg &> /dev/null; then
      dmesg 2>&1 | tail -50 | grep -i "ssh\|network\|connection" || echo "无相关内核日志"
    fi
    echo ""
    echo "=== 5. SSH配置文件权限检查 ==="
    ls -la /etc/ssh/ 2>&1 || echo "无法检查/etc/ssh权限"
    echo ""
    echo "=== 6. 文件系统状态 ==="
    df -h /etc/ssh 2>&1 || echo "无法检查文件系统"
    echo ""
  } >> "$log_file" 2>&1

  log_ssh_error "INFO" "SSH系统日志已保存到: $log_file" >&2
  echo "$log_file"
}
# Print the tail of the SSH error log followed by error-looking lines from
# the system SSH log.
# Globals:   SSH_ERROR_LOG (read)
# Arguments: $1 - number of lines to show (default 50)
show_recent_ssh_errors() {
  local count=${1:-50} # default: 50 lines
  local error_pattern="error\|fail\|denied"

  printf '%s\n\n' "=== 最近的SSH错误日志(最后 $count 行)==="
  if [ ! -f "$SSH_ERROR_LOG" ]; then
    echo "SSH错误日志文件不存在: $SSH_ERROR_LOG"
  else
    tail -n "$count" "$SSH_ERROR_LOG"
  fi

  printf '\n%s\n' "=== 最近的系统日志中的SSH错误 ==="
  if command -v journalctl &> /dev/null; then
    journalctl -u ssh --no-pager -n "$count" 2>&1 | grep -i "$error_pattern" || echo "无错误日志"
  elif [ -f "/var/log/auth.log" ]; then
    tail -n "$count" /var/log/auth.log 2>&1 | grep -i "$error_pattern" || echo "无错误日志"
  fi
}
# Run a detailed SSH configuration test and write a report under /tmp.
# The report always contains the `sshd -t` output and exit status; when the
# syntax check fails it also contains the first 100 lines of `sshd -T` and
# the non-comment lines of the config file (PasswordAuthentication masked).
# Globals: SSH_CONFIG (read)
# Outputs: the report path, and nothing else, on stdout; log messages are
#          sent to stderr so that `path=$(test_ssh_config_detailed)` yields
#          just the path.
# The report content is logged at ERROR level only when the syntax check
# actually failed; a passing check is logged at INFO level.
test_ssh_config_detailed() {
  local error_file
  local syntax_result=0
  local errors
  error_file="/tmp/ssh_config_test_$(date +%Y%m%d_%H%M%S).log"

  log_ssh_error "INFO" "正在详细测试SSH配置文件..." >&2

  {
    echo "=== SSH配置详细测试 ==="
    echo "测试时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo ""
    echo "=== 1. 配置语法检查 ==="
    sshd -t 2>&1
    syntax_result=$?
    echo "语法检查结果: $syntax_result"
    echo ""
    if [ "$syntax_result" -ne 0 ]; then
      echo "=== 2. 配置详细信息(sshd -T)==="
      sshd -T 2>&1 | head -100
      echo ""
      echo "=== 3. 配置文件内容(敏感信息已隐藏)==="
      if [ -f "$SSH_CONFIG" ]; then
        grep -v "^#" "$SSH_CONFIG" | grep -v "^$" | sed 's/PasswordAuthentication.*/PasswordAuthentication [HIDDEN]/g' | head -50
      fi
      echo ""
    fi
  } > "$error_file" 2>&1

  if [ "$syntax_result" -ne 0 ]; then
    errors=$(cat "$error_file")
    log_ssh_error "ERROR" "SSH配置测试发现错误:" >&2
    log_ssh_error "ERROR" "$errors" >&2
  else
    log_ssh_error "INFO" "SSH配置测试通过" >&2
  fi

  echo "$error_file"
}
# ========== Added: binary integrity check ==========
# Check that an sshd binary is usable.
# Looks for an executable at /usr/sbin/sshd, then /usr/bin/sshd; when ldd is
# available, verifies no shared library is reported "not found"; finally runs
# `<sshd> -t`.
# Outputs: whatever `<sshd> -t` prints (its stderr is merged into stdout)
# Returns: 0 when all checks pass, 1 otherwise (the reason is logged)
check_sshd_binary() {
  local sshd_path=""
  local candidate missing_libs line

  for candidate in /usr/sbin/sshd /usr/bin/sshd; do
    if [ -x "$candidate" ]; then
      sshd_path=$candidate
      break
    fi
  done
  if [ -z "$sshd_path" ]; then
    log_message "ERROR" "找不到sshd可执行文件"
    return 1
  fi

  # Check shared-library dependencies (using ldd).
  if command -v ldd &> /dev/null; then
    missing_libs=$(ldd "$sshd_path" 2>&1 | grep "not found" || true)
    if [ -n "$missing_libs" ]; then
      log_message "ERROR" "sshd依赖库缺失:"
      # -r keeps backslashes in the ldd output intact.
      while read -r line; do
        log_message "ERROR" " $line"
      done <<< "$missing_libs"
      return 1
    fi
  fi

  # Try running sshd -t (configuration test).
  if ! "$sshd_path" -t 2>&1; then
    log_message "ERROR" "sshd配置测试失败"
    return 1
  fi

  log_message "DEBUG" "sshd二进制文件检查通过: $sshd_path"
  return 0
}
# ========== Added: detailed diagnostics ==========
# Collect detailed diagnostics about a failed sshd start into a timestamped
# report under /tmp: binary location and type, shared-library dependencies,
# config file details, listeners on the SSH port, recent SSH log lines,
# filesystem usage and permissions.
# Globals: SSH_CONFIG, SSH_PORT (read)
# Outputs: the report path, and nothing else, on stdout; progress messages
#          are sent to stderr so that `path=$(collect_startup_diagnostics)`
#          yields just the path.
collect_startup_diagnostics() {
  local diag_file
  diag_file="/tmp/ssh_startup_diagnostics_$(date +%Y%m%d_%H%M%S).log"

  log_message "INFO" "收集启动诊断信息: $diag_file" >&2

  {
    echo "=== SSH启动诊断报告 ==="
    echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo ""
    echo "=== SSH二进制文件 ==="
    command -v sshd 2>/dev/null || echo "sshd未找到在PATH中"
    ls -la /usr/sbin/sshd /usr/bin/sshd 2>/dev/null || echo "sshd二进制文件不存在"
    file /usr/sbin/sshd /usr/bin/sshd 2>/dev/null || echo "无法检查文件类型"
    echo ""
    echo "=== 依赖库检查 ==="
    if command -v ldd &> /dev/null; then
      ldd /usr/sbin/sshd 2>/dev/null || ldd /usr/bin/sshd 2>/dev/null || echo "无法检查依赖库"
    fi
    echo ""
    echo "=== 配置文件检查 ==="
    if [ -f "$SSH_CONFIG" ]; then
      echo "配置文件存在: $SSH_CONFIG"
      echo "配置文件权限: $(ls -la "$SSH_CONFIG")"
      echo "配置文件大小: $(stat -c%s "$SSH_CONFIG" 2>/dev/null || echo '未知') 字节"
      echo ""
      echo "=== 配置内容(敏感信息已隐藏)==="
      grep -v "^#" "$SSH_CONFIG" | grep -v "^$" | sed 's/PasswordAuthentication.*/PasswordAuthentication [HIDDEN]/g' | head -50
    else
      echo "配置文件不存在: $SSH_CONFIG"
    fi
    echo ""
    echo "=== 端口占用检查 ==="
    if command -v netstat &> /dev/null; then
      netstat -tulnp 2>/dev/null | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 未被占用"
    elif command -v ss &> /dev/null; then
      ss -tulnp 2>/dev/null | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 未被占用"
    fi
    echo ""
    echo "=== 最近的错误日志 ==="
    if [ -f "/var/log/auth.log" ]; then
      tail -50 /var/log/auth.log 2>/dev/null | grep -i ssh || echo "无SSH相关日志"
    elif [ -f "/var/log/secure" ]; then
      tail -50 /var/log/secure 2>/dev/null | grep -i ssh || echo "无SSH相关日志"
    elif command -v journalctl &> /dev/null; then
      journalctl -u ssh --no-pager -n 50 2>/dev/null || echo "无法获取journalctl日志"
    fi
    echo ""
    echo "=== 文件系统检查 ==="
    df -h /etc/ssh 2>/dev/null || echo "无法检查文件系统"
    echo ""
    echo "=== 权限检查 ==="
    ls -la /etc/ssh/ 2>/dev/null || echo "无法检查/etc/ssh权限"
    ls -la /var/run/ 2>/dev/null | grep ssh || echo "无法检查/var/run/ssh"
    echo ""
  } > "$diag_file" 2>&1

  log_message "INFO" "启动诊断信息已保存: $diag_file" >&2
  echo "$diag_file"
}
# Core logging function.
# Globals:   LOG_FILE, MAX_LOG_SIZE, RED, GREEN, YELLOW, BLUE, NC (read)
# Arguments: $1 - level (INFO, WARN, ERROR, DEBUG or anything else)
#            $2 - message
# Outputs:   appends "[timestamp] [level] message" to $LOG_FILE and echoes
#            the same line to the console: ERROR goes to stderr, every other
#            level to stdout. Colors are used only when stdout is a terminal
#            and the level is one of the four known levels.
log_message() {
  local level=$1
  local message=$2
  local stamp
  local size
  local console_fmt='[%s] [%s] %s\n'
  stamp=$(date '+%Y-%m-%d %H:%M:%S')

  # Log rotation: once the file exceeds the size limit, move it aside under
  # a timestamped name and start a fresh one.
  if [ -f "$LOG_FILE" ]; then
    size=$(stat -c%s "$LOG_FILE" 2>/dev/null || echo "0")
    if [ "$size" -gt $MAX_LOG_SIZE ]; then
      mv "$LOG_FILE" "${LOG_FILE}.$(date +%Y%m%d_%H%M%S)" 2>/dev/null
      touch "$LOG_FILE"
    fi
  fi

  # The file entry is always plain text.
  printf '[%s] [%s] %s\n' "$stamp" "$level" "$message" >> "$LOG_FILE"

  # Console entry: pick a colored format on a terminal; in a non-terminal
  # environment (e.g. Docker logs) the plain format is kept.
  if [ -t 1 ]; then
    case $level in
      "INFO") console_fmt="${GREEN}[%s] [%s] %s${NC}\n" ;;
      "WARN") console_fmt="${YELLOW}[%s] [%s] %s${NC}\n" ;;
      "ERROR") console_fmt="${RED}[%s] [%s] %s${NC}\n" ;;
      "DEBUG") console_fmt="${BLUE}[%s] [%s] %s${NC}\n" ;;
    esac
  fi

  if [ "$level" = "ERROR" ]; then
    printf "$console_fmt" "$stamp" "$level" "$message" >&2
  else
    printf "$console_fmt" "$stamp" "$level" "$message"
  fi
}
# Escape a string for embedding inside a JSON string literal
# (backslash, double quote, newline, carriage return, tab).
_notify_json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Send an alert through the configured webhook.
# Globals:   NOTIFICATION_ENABLED, NOTIFICATION_WEBHOOK, LOG_FILE (read)
# Arguments: $1 - subject, $2 - message
# Returns:   0 when notifications are disabled or no webhook is set;
#            otherwise curl's exit status
# Subject and message are JSON-escaped before being embedded in the payload,
# so quotes, backslashes or line breaks in them cannot produce invalid JSON.
# The request is time-limited so an unresponsive webhook cannot stall the
# watchdog.
send_notification() {
  local subject=$1
  local message=$2
  local payload now

  if [ "$NOTIFICATION_ENABLED" != "true" ]; then
    return 0
  fi

  log_message "INFO" "发送通知: $subject"

  # Webhook notification ("msgtype: text" payload, as used by e.g. WeCom or
  # DingTalk style webhooks).
  if [ -n "$NOTIFICATION_WEBHOOK" ]; then
    now=$(date '+%Y-%m-%d %H:%M:%S')
    printf -v payload \
      '{\n"msgtype": "text",\n"text": {\n"content": "SSH看门狗告警\\n标题: %s\\n详情: %s\\n时间: %s"\n}\n}' \
      "$(_notify_json_escape "$subject")" \
      "$(_notify_json_escape "$message")" \
      "$now"
    curl -s --connect-timeout 5 --max-time 15 -X POST "$NOTIFICATION_WEBHOOK" \
      -H 'Content-Type: application/json' \
      -d "$payload" >> "$LOG_FILE" 2>&1
  fi
  # Extension point: e-mail, SMS or other notification channels.
}
# Collect a general diagnostics report into a timestamped file under /tmp:
# system info, sshd processes, listeners on the SSH port, `sshd -t` result,
# memory/disk/load figures, recent SSH log lines and firewall status.
# Globals: SSH_PORT, SSH_CONFIG (read)
# Outputs: the report path, and nothing else, on stdout; progress messages
#          are sent to stderr so that `path=$(collect_diagnostics)` yields
#          just the path (callers embed it in notifications).
collect_diagnostics() {
  local diag_file
  diag_file="/tmp/ssh_diagnostics_$(date +%Y%m%d_%H%M%S).log"

  log_message "INFO" "收集诊断信息到: $diag_file" >&2

  {
    echo "=== SSH服务诊断报告 ==="
    echo "收集时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo ""
    echo "=== 系统信息 ==="
    uname -a
    echo ""
    echo "=== SSH进程状态 ==="
    ps aux | grep sshd | grep -v grep || echo "无SSH进程运行"
    echo ""
    echo "=== 网络连接状态 ==="
    # The trailing space in the pattern keeps port 22 from matching 2222.
    if command -v netstat &> /dev/null; then
      netstat -tulnp 2>/dev/null | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 无监听"
    elif command -v ss &> /dev/null; then
      ss -tulnp 2>/dev/null | grep ":$SSH_PORT " || echo "端口 $SSH_PORT 无监听"
    else
      echo "端口 $SSH_PORT 无监听"
    fi
    echo ""
    echo "=== SSH配置检查 ==="
    if [ -f "$SSH_CONFIG" ]; then
      sshd -t 2>&1 || echo "SSH配置文件有错误"
    else
      echo "SSH配置文件不存在: $SSH_CONFIG"
    fi
    echo ""
    echo "=== 系统资源状态 ==="
    echo "内存使用:"
    free -h
    echo ""
    echo "磁盘使用:"
    df -h
    echo ""
    echo "系统负载:"
    uptime
    echo ""
    echo "=== 最近的系统日志(SSH相关)==="
    if command -v journalctl &> /dev/null; then
      journalctl -u ssh --no-pager -n 50 2>/dev/null || echo "无法获取journalctl日志"
    else
      tail -100 /var/log/auth.log 2>/dev/null || tail -100 /var/log/secure 2>/dev/null || echo "无法获取SSH日志"
    fi
    echo ""
    echo "=== 防火墙状态 ==="
    if command -v iptables &> /dev/null; then
      iptables -L -n 2>/dev/null | head -50
    fi
    if command -v ufw &> /dev/null; then
      ufw status 2>/dev/null
    fi
    echo ""
  } > "$diag_file" 2>&1

  log_message "INFO" "诊断信息已保存到: $diag_file" >&2
  echo "$diag_file" # hand the report path back to the caller
}
# Check whether a process named exactly "sshd" exists.
# Returns: 0 when pgrep finds one, 1 otherwise
check_ssh_process() {
  pgrep -x "sshd" > /dev/null && return 0 # an sshd process exists
  return 1                                # no sshd process
}
# Check whether something is listening on the SSH port.
# Globals: SSH_PORT (read)
# Returns: 0 when a listener on ":$SSH_PORT" is found, 1 otherwise
# Uses netstat when available and falls back to ss, so that hosts without
# net-tools do not report a healthy sshd as "port not listening".
check_ssh_port() {
  local listing
  if command -v netstat > /dev/null 2>&1; then
    listing=$(netstat -tuln 2>/dev/null)
  elif command -v ss > /dev/null 2>&1; then
    listing=$(ss -tuln 2>/dev/null)
  else
    return 1 # no tool available to inspect listeners
  fi

  if printf '%s\n' "$listing" | grep -q ":$SSH_PORT "; then
    return 0 # port is listening
  fi
  return 1 # port is not listening
}
# Check whether the SSH port accepts a TCP connection on localhost.
# Globals: SSH_PORT (read)
# Returns: the exit status of the 5-second connection attempt
check_ssh_response() {
  # Simple connection probe via bash's /dev/tcp, bounded by timeout.
  local probe="</dev/tcp/localhost/$SSH_PORT"
  timeout 5 bash -c "$probe" 2>/dev/null
}
# Check system resources (memory and root filesystem usage).
# Returns: 1 (after logging a WARN) when memory usage or "/" disk usage is
#          above 90 percent, 0 otherwise
check_system_resources() {
  local mem_pct disk_pct
  # used/total from the "Mem" line of free, truncated to an integer.
  mem_pct=$(free | awk '/Mem/ {print ($3/$2) * 100}' | cut -d. -f1)
  # Use% column of the last df line for "/", without the percent sign.
  disk_pct=$(df / | tail -1 | awk '{print $5}' | tr -d '%')

  if [ "$mem_pct" -gt 90 ]; then
    log_message "WARN" "内存使用率过高: ${mem_pct}%"
    return 1
  fi

  if [ "$disk_pct" -gt 90 ]; then
    log_message "WARN" "磁盘使用率过高: ${disk_pct}%"
    return 1
  fi

  return 0
}
# Attempt to repair the SSH service.
# Steps, in order: restart rate-limit check, diagnostics collection, stop the
# running sshd, binary check, config integrity check, config syntax check and
# repair, config backup, sshd start, verification.
# Returns: 0 when an sshd process and a listener on the SSH port are both
#          detected after the start; 1 when the rate limit is exceeded, no
#          sshd binary is found, the start command fails, or verification
#          fails (in the last case a backoff wait is applied first).
repair_ssh_service() {
log_message "INFO" "开始尝试修复SSH服务..."
# 0. Check the restart rate limit
if ! check_restart_rate_limit; then
send_notification "SSH重启频率超限" "过去1小时内已尝试重启 $MAX_RESTARTS_PER_HOUR 次,暂停自动重启"
return 1
fi
# 1. Collect diagnostics before repairing
local diag_file=$(collect_diagnostics)
send_notification "SSH服务异常" "SSH服务检测到异常,开始尝试修复。诊断信息: $diag_file"
# 2. Stop the existing SSH service (if any)
log_message "INFO" "停止现有的SSH服务..."
killall sshd 2>/dev/null
sleep 2
# 3. Check the sshd binary; a failure is reported but the repair continues
log_message "INFO" "检查SSH二进制文件完整性..."
if ! check_sshd_binary; then
log_message "ERROR" "SSH二进制文件检查失败,尝试收集更多信息..."
local startup_diag=$(collect_startup_diagnostics)
send_notification "SSH二进制文件损坏" "SSH二进制文件检查失败,诊断信息: $startup_diag"
fi
# 4. Verify config integrity; on checksum mismatch restore the backup config if present
log_message "INFO" "验证SSH配置完整性..."
if ! verify_config_integrity; then
log_ssh_error "WARN" "配置完整性验证失败,使用备份配置"
if [ -f "$SSH_BACKUP_CONFIG" ]; then
cp "$SSH_BACKUP_CONFIG" "$SSH_CONFIG"
log_message "INFO" "已恢复备份配置"
fi
fi
# 5. Check the SSH config syntax (with detailed error output)
log_message "INFO" "检查SSH配置文件..."
if ! sshd -t 2>> "$SSH_ERROR_LOG"; then
log_ssh_error "ERROR" "SSH配置文件有错误,详细信息:"
local config_error_file=$(test_ssh_config_detailed)
log_ssh_error "ERROR" "配置文件测试详情已保存到: $config_error_file"
# Config repair: use the backup config, or generate a minimal config
if [ -f "$SSH_BACKUP_CONFIG" ]; then
log_message "INFO" "使用备份配置文件: $SSH_BACKUP_CONFIG"
cp "$SSH_BACKUP_CONFIG" "$SSH_CONFIG"
else
log_message "WARN" "备份配置不存在,创建最小配置"
create_minimal_ssh_config
fi
# Check the config again; fall back to the default config if still invalid
if ! sshd -t 2>> "$SSH_ERROR_LOG"; then
log_ssh_error "ERROR" "配置修复失败,使用默认配置"
create_default_ssh_config
fi
fi
# 6. Back up the current config and refresh its checksum (only if the config is valid)
if sshd -t 2>/dev/null; then
cp "$SSH_CONFIG" "$SSH_BACKUP_CONFIG" 2>/dev/null
save_config_checksum
log_message "INFO" "SSH配置已备份到: $SSH_BACKUP_CONFIG"
fi
# 7. Start the SSH service (with detailed error output)
log_message "INFO" "启动SSH服务..."
local sshd_path=""
if [ -x "/usr/sbin/sshd" ]; then
sshd_path="/usr/sbin/sshd"
elif [ -x "/usr/bin/sshd" ]; then
sshd_path="/usr/bin/sshd"
else
log_ssh_error "ERROR" "找不到sshd可执行文件"
local startup_diag=$(collect_startup_diagnostics)
log_ssh_error "ERROR" "启动诊断信息: $startup_diag"
send_notification "SSH修复失败" "找不到sshd可执行文件。诊断: $startup_diag"
return 1
fi
# Try to start the SSH service, capturing its stderr in a temp file
log_message "INFO" "尝试启动SSH服务: $sshd_path"
local sshd_output=$(mktemp)
if ! $sshd_path 2> "$sshd_output"; then
log_ssh_error "ERROR" "SSH服务启动失败,详细错误信息:"
cat "$sshd_output" | while read line; do
log_ssh_error "ERROR" " $line"
done
log_ssh_error "ERROR" "SSH守护进程调试信息:"
local sshd_debug=$(capture_sshd_errors)
log_ssh_error "ERROR" "调试信息已保存到: $sshd_debug"
log_ssh_error "ERROR" "系统日志信息:"
local system_logs=$(collect_ssh_system_logs)
log_ssh_error "ERROR" "系统日志已保存到: $system_logs"
send_notification "SSH启动失败" "SSH服务启动失败,详细错误信息已记录到: $SSH_ERROR_LOG"
rm -f "$sshd_output"
return 1
fi
rm -f "$sshd_output"
sleep 3
# 8. Verify that the start succeeded (process present and port listening)
if check_ssh_process && check_ssh_port; then
log_message "INFO" "SSH服务修复成功"
reset_backoff
# Show the most recent SSH error log entries (if any)
if [ -f "$SSH_ERROR_LOG" ]; then
log_message "INFO" "最近的SSH错误日志:"
show_recent_ssh_errors 10
fi
send_notification "SSH服务恢复" "SSH服务已成功修复并重新启动"
# Report the status to the monitoring API
report_to_monitor_api "ssh_recovered" "SSH service has been successfully recovered"
return 0
else
log_message "ERROR" "SSH服务修复失败,应用指数退避"
apply_backoff
send_notification "SSH修复失败" "SSH服务修复失败,已应用退避策略等待重试。诊断信息: $diag_file"
# Report the status to the monitoring API
report_to_monitor_api "ssh_repair_failed" "SSH service repair failed, applying backoff"
return 1
fi
}
# Overwrite $SSH_CONFIG with a minimal sshd configuration.
# Globals: SSH_CONFIG (file is overwritten), SSH_PORT (read)
# NOTE(review): the generated config enables root login and password
# authentication — confirm this is acceptable for the deployment.
create_minimal_ssh_config() {
  local -a config_lines
  log_message "INFO" "创建最小SSH配置..."

  config_lines=(
    "# 最小SSH配置(由看门狗自动生成)"
    "Port $SSH_PORT"
    "AddressFamily any"
    "ListenAddress 0.0.0.0"
    "ListenAddress ::"
    "PermitRootLogin yes"
    "PasswordAuthentication yes"
    "PubkeyAuthentication yes"
    "AuthorizedKeysFile .ssh/authorized_keys"
    "UsePAM yes"
    "PrintMotd no"
    "AcceptEnv LANG LC_*"
    "Subsystem sftp /usr/lib/openssh/sftp-server"
  )
  printf '%s\n' "${config_lines[@]}" > "$SSH_CONFIG"

  log_message "INFO" "最小SSH配置已创建: $SSH_CONFIG"
}
# Install a default sshd configuration.
# Copies /etc/ssh/sshd_config.default over $SSH_CONFIG when that file exists;
# otherwise generates the minimal configuration.
# Globals: SSH_CONFIG (file is overwritten)
create_default_ssh_config() {
  local system_default="/etc/ssh/sshd_config.default"
  log_message "WARN" "创建默认SSH配置..."

  # No system-provided default: build a basic config instead.
  if [ ! -f "$system_default" ]; then
    create_minimal_ssh_config
    return
  fi

  cp "$system_default" "$SSH_CONFIG"
}
# Escape a string for embedding inside a JSON string literal
# (backslash, double quote, newline, carriage return, tab).
_monitor_json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Report an event to the monitoring API by POSTing JSON to
# "$MONITOR_API_URL/events".
# Globals:   MONITOR_API_URL, LOG_FILE (read)
# Arguments: $1 - event name, $2 - message
# Returns:   0 when no API URL is configured; otherwise curl's exit status
# Event and message are JSON-escaped, and the request is time-limited so an
# unresponsive endpoint cannot stall the watchdog loop.
report_to_monitor_api() {
  local status=$1
  local message=$2
  local payload

  if [ -z "$MONITOR_API_URL" ]; then
    return 0
  fi

  log_message "DEBUG" "报告状态到监控API: $status"

  printf -v payload \
    '{\n"event": "%s",\n"message": "%s",\n"timestamp": "%s",\n"service": "ssh_watchdog"\n}' \
    "$(_monitor_json_escape "$status")" \
    "$(_monitor_json_escape "$message")" \
    "$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  curl -s --connect-timeout 5 --max-time 15 -X POST "$MONITOR_API_URL/events" \
    -H 'Content-Type: application/json' \
    -d "$payload" >> "$LOG_FILE" 2>&1
}
# Main monitoring loop; never returns.
# Every CHECK_INTERVAL seconds it checks system resources, the sshd process,
# the listening port and the connection response. Once the consecutive
# failure count reaches MAX_RETRIES it calls repair_ssh_service, and once it
# reaches MAX_RETRIES * 3 while repairs keep failing it also calls
# force_recover_ssh.
main_loop() {
log_message "INFO" "SSH服务看门狗启动,检查间隔: ${CHECK_INTERVAL}秒"
log_message "INFO" "配置: SSH端口=$SSH_PORT, 最大重试次数=$MAX_RETRIES, 通知=$([ "$NOTIFICATION_ENABLED" = "true" ] && echo "启用" || echo "禁用")"
log_message "INFO" "重启频率限制: $MAX_RESTARTS_PER_HOUR/小时, 指数退避: ${BACKOFF_BASE}s 基础, ${BACKOFF_MAX}s 最大"
local consecutive_failures=0
local total_checks=0
local last_notify_time=0
local notify_interval=3600 # notification interval: 1 hour
# Initial check: make sure the SSH config has a backup and a checksum
if [ -f "$SSH_CONFIG" ] && sshd -t 2>/dev/null; then
cp "$SSH_CONFIG" "$SSH_BACKUP_CONFIG" 2>/dev/null
save_config_checksum
log_message "INFO" "初始SSH配置备份和校验和已保存"
else
log_message "WARN" "初始SSH配置检查失败,将在首次修复时创建"
fi
while true; do
total_checks=$((total_checks + 1))
local current_time=$(date +%s)
local ssh_status="unknown"
# 1. Check system resources (a warning only; it does not count as a failure)
if ! check_system_resources; then
log_message "WARN" "系统资源不足,可能影响SSH服务稳定性"
ssh_status="resource_warning"
fi
# 2. Check the SSH process
if ! check_ssh_process; then
log_message "ERROR" "SSH进程不存在"
consecutive_failures=$((consecutive_failures + 1))
ssh_status="no_process"
else
# 3. Check the SSH port
if ! check_ssh_port; then
log_message "ERROR" "SSH端口 $SSH_PORT 未监听"
consecutive_failures=$((consecutive_failures + 1))
ssh_status="port_not_listening"
else
# 4. Check the SSH response
if ! check_ssh_response; then
log_message "WARN" "SSH端口监听但无响应"
consecutive_failures=$((consecutive_failures + 1))
ssh_status="no_response"
else
# The SSH service is fully healthy
ssh_status="healthy"
if [ $consecutive_failures -gt 0 ]; then
log_message "INFO" "SSH服务恢复正常 (之前连续失败 $consecutive_failures 次)"
send_notification "SSH服务恢复" "SSH服务已恢复正常 after $consecutive_failures 次失败"
report_to_monitor_api "ssh_healthy" "SSH service is healthy after $consecutive_failures failures"
fi
consecutive_failures=0
reset_backoff
fi
fi
fi
# 5. If the consecutive failure count reaches the threshold, attempt a repair
if [ $consecutive_failures -ge $MAX_RETRIES ]; then
log_message "ERROR" "SSH服务连续失败 $consecutive_failures 次,开始修复..."
send_notification "SSH服务异常" "SSH服务连续失败 $consecutive_failures 次,开始自动修复"
if repair_ssh_service; then
consecutive_failures=0
log_message "INFO" "SSH服务修复成功,重置失败计数器"
send_notification "SSH修复成功" "SSH服务已成功修复并重新启动"
else
log_message "ERROR" "SSH服务修复失败,将在下次检查时重试"
# Collect detailed diagnostics (only on the first failure or when enough time has passed)
if [ $((current_time - last_notify_time)) -gt $notify_interval ]; then
local diag_file=$(collect_diagnostics)
send_notification "SSH修复失败" "SSH服务修复失败,诊断信息已保存到: $diag_file"
last_notify_time=$current_time
fi
# The failure counter is not reset, so the repair is retried
# After too many failures, try a more aggressive recovery strategy
if [ $consecutive_failures -ge $((MAX_RETRIES * 3)) ]; then
log_message "ERROR" "SSH服务多次修复失败(已失败 $consecutive_failures 次),尝试强制恢复..."
force_recover_ssh
fi
fi
fi
# 6. Periodic status record (once every 10 check cycles)
if [ $((total_checks % 10)) -eq 0 ]; then
log_message "INFO" "SSH服务状态: $ssh_status, 已连续监控 $total_checks 个周期, 连续失败: $consecutive_failures, 退避: $(( current_backoff - 1 )) 级"
report_to_monitor_api "ssh_status" "Status: $ssh_status, Checks: $total_checks, Failures: $consecutive_failures, Backoff: $(( current_backoff - 1 ))"
fi
# 7. Watchdog self health check (once every 100 check cycles)
if [ $((total_checks % 100)) -eq 0 ]; then
log_message "DEBUG" "看门狗运行正常,已执行 $total_checks 次检查,重启计数: $restart_count"
# Clean up old diagnostic files (keep the 5 most recent)
cleanup_old_diagnostics
# Verify config integrity
verify_config_integrity || log_message "WARN" "配置完整性验证失败"
fi
sleep $CHECK_INTERVAL
done
}
# Force-recover the SSH service (a more aggressive recovery strategy).
# Steps, in order: restart rate-limit check, kill -9 all sshd processes,
# remove /tmp/ssh-* and the sshd pid file, recreate runtime directories,
# reconfigure/reinstall openssh-server through the package manager, generate
# missing host keys, overwrite the config with the minimal config, start sshd
# and verify.
# Returns: 0 when an sshd process and a listener on the SSH port are both
#          detected afterwards; 1 when the rate limit is exceeded, sshd cannot
#          be found even after an install attempt, or verification fails.
force_recover_ssh() {
log_message "WARN" "执行强制恢复策略..."
# 0. Check the restart rate limit
if ! check_restart_rate_limit; then
send_notification "SSH强制恢复失败" "重启频率超限,跳过强制恢复"
return 1
fi
# Record a detailed forced-recovery log entry
log_ssh_error "WARN" "开始强制恢复SSH服务..."
# 1. Kill all sshd processes
log_message "INFO" "强制终止所有SSH进程..."
log_ssh_error "INFO" "强制终止所有SSH进程"
killall -9 sshd 2>/dev/null || true
sleep 5
# 2. Clean up possibly stale SSH sockets and temporary files
log_message "INFO" "清理SSH相关临时文件..."
log_ssh_error "INFO" "清理SSH相关临时文件"
rm -f /tmp/ssh-* 2>/dev/null || true
rm -f /var/run/sshd.pid 2>/dev/null || true
# 3. Check and create the required directories
log_message "INFO" "检查SSH必要目录..."
log_ssh_error "INFO" "检查SSH必要目录和权限"
mkdir -p /var/run/sshd 2>/dev/null || true
mkdir -p /etc/ssh 2>/dev/null || true
chmod 755 /var/run/sshd 2>/dev/null || true
# 4. Try to reconfigure/reinstall the SSH service via the package manager
if command -v apt-get &> /dev/null; then
log_message "INFO" "尝试重新配置SSH服务..."
log_ssh_error "INFO" "尝试重新配置SSH服务 (dpkg-reconfigure)"
dpkg-reconfigure -f noninteractive openssh-server 2>> "$SSH_ERROR_LOG" || true
# If the config test still fails, try reinstalling
if ! sshd -t 2>/dev/null; then
log_message "WARN" "尝试重新安装openssh-server..."
log_ssh_error "WARN" "配置测试失败,尝试重新安装openssh-server"
apt-get update -qq 2>> "$SSH_ERROR_LOG" || true
DEBIAN_FRONTEND=noninteractive apt-get install -y --reinstall openssh-server 2>> "$SSH_ERROR_LOG" || true
fi
elif command -v yum &> /dev/null; then
log_message "INFO" "尝试重新安装openssh-server (yum)..."
log_ssh_error "INFO" "尝试重新安装openssh-server (yum)"
yum reinstall -y openssh-server 2>> "$SSH_ERROR_LOG" || true
fi
# 5. Generate SSH host keys (only those that do not exist)
log_message "INFO" "检查SSH主机密钥..."
log_ssh_error "INFO" "检查并生成SSH主机密钥"
if [ ! -f "/etc/ssh/ssh_host_rsa_key" ]; then
log_message "INFO" "生成SSH RSA主机密钥..."
ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -N '' 2>> "$SSH_ERROR_LOG" || true
fi
if [ ! -f "/etc/ssh/ssh_host_ecdsa_key" ]; then
log_message "INFO" "生成SSH ECDSA主机密钥..."
ssh-keygen -t ecdsa -f /etc/ssh/ssh_host_ecdsa_key -N '' 2>> "$SSH_ERROR_LOG" || true
fi
if [ ! -f "/etc/ssh/ssh_host_ed25519_key" ]; then
log_message "INFO" "生成SSH ED25519主机密钥..."
ssh-keygen -t ed25519 -f /etc/ssh/ssh_host_ed25519_key -N '' 2>> "$SSH_ERROR_LOG" || true
fi
# 6. Use the most conservative config (this overwrites the current config file)
log_message "INFO" "使用最保守的SSH配置..."
log_ssh_error "INFO" "使用最保守的SSH配置"
create_minimal_ssh_config
# 7. Validate the config (detailed output)
log_ssh_error "INFO" "验证SSH配置(详细模式)"
if ! sshd -t 2>> "$SSH_ERROR_LOG"; then
log_ssh_error "ERROR" "SSH配置验证失败,详细信息:"
local config_error_file=$(test_ssh_config_detailed)
log_ssh_error "ERROR" "配置测试详情已保存到: $config_error_file"
log_message "ERROR" "SSH配置验证失败,使用默认配置"
create_default_ssh_config
fi
# 8. Try to start the SSH service (with detailed error output)
log_message "INFO" "强制启动SSH服务..."
log_ssh_error "INFO" "尝试启动SSH服务"
local sshd_path=""
if [ -x "/usr/sbin/sshd" ]; then
sshd_path="/usr/sbin/sshd"
elif [ -x "/usr/bin/sshd" ]; then
sshd_path="/usr/bin/sshd"
else
log_ssh_error "ERROR" "找不到sshd可执行文件,尝试重新安装..."
local startup_diag=$(collect_startup_diagnostics)
log_ssh_error "ERROR" "启动诊断信息: $startup_diag"
send_notification "SSH二进制文件丢失" "找不到sshd可执行文件。诊断: $startup_diag"
# Try to install the package
if command -v apt-get &> /dev/null; then
log_ssh_error "INFO" "尝试安装openssh-server"
DEBIAN_FRONTEND=noninteractive apt-get install -y openssh-server 2>> "$SSH_ERROR_LOG" || true
fi
# Look for the binary again
if [ -x "/usr/sbin/sshd" ]; then
sshd_path="/usr/sbin/sshd"
elif [ -x "/usr/bin/sshd" ]; then
sshd_path="/usr/bin/sshd"
else
log_ssh_error "ERROR" "重新安装后仍然找不到sshd,无法强制恢复"
local system_logs=$(collect_ssh_system_logs)
log_ssh_error "ERROR" "系统日志已保存到: $system_logs"
return 1
fi
fi
# Check the binary; a failure is reported but the start is still attempted
if ! check_sshd_binary; then
log_ssh_error "ERROR" "sshd二进制文件检查失败"
local startup_diag=$(collect_startup_diagnostics)
log_ssh_error "ERROR" "启动诊断信息: $startup_diag"
send_notification "SSH二进制文件损坏" "sshd二进制文件检查失败。诊断: $startup_diag"
fi
# Try to start the SSH service, capturing its stderr in a temp file
log_ssh_error "INFO" "执行命令: $sshd_path"
local sshd_output=$(mktemp)
if ! $sshd_path 2> "$sshd_output"; then
log_ssh_error "ERROR" "SSH服务启动失败,详细错误信息:"
cat "$sshd_output" | while read line; do
log_ssh_error "ERROR" " $line"
done
log_ssh_error "ERROR" "SSH守护进程调试信息:"
local sshd_debug=$(capture_sshd_errors)
log_ssh_error "ERROR" "调试信息已保存到: $sshd_debug"
log_ssh_error "ERROR" "系统日志信息:"
local system_logs=$(collect_ssh_system_logs)
log_ssh_error "ERROR" "系统日志已保存到: $system_logs"
send_notification "SSH强制启动失败" "SSH服务强制启动失败,详细错误信息已记录到: $SSH_ERROR_LOG"
rm -f "$sshd_output"
else
log_ssh_error "INFO" "SSH服务启动成功(无错误输出)"
rm -f "$sshd_output"
fi
sleep 5
# 9. Verify whether the recovery succeeded (process present and port listening)
if check_ssh_process && check_ssh_port; then
log_message "INFO" "强制恢复成功"
log_ssh_error "INFO" "强制恢复成功,SSH服务已正常运行"
reset_backoff
# Show the most recent SSH error log entries (if any)
if [ -f "$SSH_ERROR_LOG" ]; then
log_message "INFO" "最近的SSH错误日志:"
show_recent_ssh_errors 10
fi
send_notification "SSH强制恢复成功" "SSH服务已通过强制恢复策略成功恢复"
# Save the checksum of the new config
save_config_checksum
return 0
else
log_message "ERROR" "强制恢复失败,应用指数退避"
log_ssh_error "ERROR" "强制恢复失败,应用指数退避"
apply_backoff
local startup_diag=$(collect_startup_diagnostics)
log_ssh_error "ERROR" "启动诊断信息: $startup_diag"
local system_logs=$(collect_ssh_system_logs)
log_ssh_error "ERROR" "系统日志已保存到: $system_logs"
send_notification "SSH强制恢复失败" "SSH服务强制恢复失败,详细错误信息已记录到: $SSH_ERROR_LOG"
return 1
fi
}
# Remove old diagnostic reports, keeping the 5 most recent of each kind.
# Arguments: $1 - directory holding the reports (optional, default /tmp)
# Covers every report family this script writes (general diagnostics,
# startup diagnostics, system logs, sshd error captures and config test
# reports), so none of them accumulates without bound.
cleanup_old_diagnostics() {
  local dir=${1:-/tmp}
  local keep=5
  local prefix stale
  local removed=0

  log_message "DEBUG" "清理旧的诊断文件..."

  for prefix in \
    ssh_diagnostics_ \
    ssh_startup_diagnostics_ \
    ssh_system_logs_ \
    sshd_error_output_ \
    ssh_config_test_; do
    # ls -t lists newest first; everything after the first $keep entries is
    # stale. An unmatched glob makes ls fail quietly and the loop sees no input.
    while IFS= read -r stale; do
      [ -n "$stale" ] || continue
      if rm -f -- "$stale" 2>/dev/null; then
        removed=1
      fi
    done < <(ls -t -- "$dir/$prefix"*.log 2>/dev/null | tail -n +$((keep + 1)))
  done

  if [ "$removed" -eq 1 ]; then
    log_message "DEBUG" "已清理旧的诊断文件"
  fi
}
# Signal handler: graceful shutdown of the watchdog.
# Reports the stop event, makes one last attempt to start sshd if no sshd
# process is running, then exits the script with status 0.
# Globals: LOG_FILE (read; sshd's stderr is appended to it)
cleanup() {
  local candidate
  log_message "INFO" "接收到终止信号,SSH服务看门狗开始清理..."

  # Tell the monitoring API that the watchdog is stopping.
  report_to_monitor_api "watchdog_stopping" "SSH watchdog is stopping"

  # Optional: make sure the SSH service is still running before leaving.
  if ! check_ssh_process; then
    log_message "WARN" "退出前检测到SSH服务未运行,尝试最后一次启动..."
    for candidate in /usr/sbin/sshd /usr/bin/sshd; do
      if [ -x "$candidate" ]; then
        "$candidate" 2>> "$LOG_FILE"
        break
      fi
    done
  fi

  log_message "INFO" "SSH服务看门狗已退出"
  exit 0
}
# Print the help text (options and examples) to stdout.
# $0 is expanded inside the here-document, so the text shows the name the
# script was invoked with.
show_usage() {
cat <<EOF
SSH服务看门狗脚本 v4.0
用法: $0 [选项]
选项:
--help, -h 显示此帮助信息
--show-errors [行数] 显示最近的SSH错误日志(默认50行)
--show-log [行数] 显示看门狗主日志(默认50行)
--test-config 详细测试SSH配置文件
--capture-errors 捕获SSH守护进程的详细错误
--collect-logs 收集SSH相关的系统日志
--check-binary 检查SSH二进制文件完整性
--verify-config 验证SSH配置完整性
--status 显示SSH服务当前状态
示例:
$0 --show-errors 100 # 显示最近100条SSH错误日志
$0 --test-config # 详细测试SSH配置文件
$0 --status # 显示SSH服务状态
不带参数运行时,脚本将启动看门狗监控循环。
EOF
}
# Parse command-line arguments.
# Only the first argument selects the action ($2 is an optional line count
# for --show-errors / --show-log). Every recognized option runs its action
# and exits, so the watchdog loop below is reached only with no arguments.
if [ $# -gt 0 ]; then
case "$1" in
--help|-h)
show_usage
exit 0
;;
--show-errors)
# Optional second argument: number of lines (default 50)
lines=${2:-50}
show_recent_ssh_errors "$lines"
exit 0
;;
--show-log)
# Optional second argument: number of lines (default 50)
lines=${2:-50}
echo "=== 看门狗主日志(最后 $lines 行)==="
if [ -f "$LOG_FILE" ]; then
tail -n "$lines" "$LOG_FILE"
else
echo "日志文件不存在: $LOG_FILE"
fi
exit 0
;;
--test-config)
echo "=== 详细测试SSH配置文件 ==="
test_ssh_config_detailed
exit 0
;;
--capture-errors)
echo "=== 捕获SSH守护进程错误 ==="
capture_sshd_errors
exit 0
;;
--collect-logs)
echo "=== 收集SSH系统日志 ==="
collect_ssh_system_logs
exit 0
;;
--check-binary)
# Exit status 1 when the binary check fails
echo "=== 检查SSH二进制文件 ==="
if check_sshd_binary; then
echo "✓ SSH二进制文件检查通过"
else
echo "✗ SSH二进制文件检查失败"
exit 1
fi
exit 0
;;
--verify-config)
# Exit status 1 when the integrity check fails
echo "=== 验证SSH配置完整性 ==="
if verify_config_integrity; then
echo "✓ SSH配置完整性验证通过"
else
echo "✗ SSH配置完整性验证失败"
exit 1
fi
exit 0
;;
--status)
# One-shot status report: process, port, connection response, recent errors
echo "=== SSH服务状态 ==="
echo -n "SSH进程: "
if check_ssh_process; then
echo "运行中 (PIDs: $(pgrep -x sshd | tr '\n' ' '))"
else
echo "未运行"
fi
echo -n "SSH端口 $SSH_PORT: "
if check_ssh_port; then
echo "监听中"
else
echo "未监听"
fi
echo -n "SSH响应: "
if check_ssh_response; then
echo "正常"
else
echo "无响应"
fi
echo ""
echo "=== 最近的错误日志(最后10行)==="
if [ -f "$SSH_ERROR_LOG" ]; then
tail -10 "$SSH_ERROR_LOG"
else
echo "无错误日志"
fi
exit 0
;;
*)
# Unknown option: print usage and exit with an error
echo "未知选项: $1"
show_usage
exit 1
;;
esac
fi
# Register the signal handler: cleanup runs on SIGTERM, SIGINT and SIGHUP
trap cleanup SIGTERM SIGINT SIGHUP
# The watchdog loop must run as root
if [ "$EUID" -ne 0 ]; then
echo "ERROR: 此脚本需要以root权限运行" >&2
exit 1
fi
# Create the log directories and log files
mkdir -p "$(dirname "$LOG_FILE")"
mkdir -p "$(dirname "$SSH_ERROR_LOG")"
touch "$LOG_FILE"
touch "$SSH_ERROR_LOG"
# Report the watchdog start to the monitoring API
report_to_monitor_api "watchdog_started" "SSH watchdog started with check interval ${CHECK_INTERVAL}s"
# Start the main loop
log_message "INFO" "=========================================="
log_message "INFO" "SSH服务看门狗即将启动主监控循环"
log_message "INFO" "=========================================="
log_message "INFO" "使用 '$0 --help' 查看可用命令"
log_message "INFO" "使用 '$0 --show-errors' 查看SSH错误日志"
main_loop