#!/usr/bin/env python3
"""Hermes self-healing script.

Runs a fixed sequence of health checks -- gateway/dashboard processes,
memory, disk, configuration drift and the Feishu WebSocket process -- and
either applies a simple remediation (delete old files, re-run the start
script) or logs a warning. Intended to be invoked periodically, e.g. from a
cron job.
"""

import hashlib
import json
import os
import subprocess
import sys
from datetime import datetime

LOG_FILE = "/tmp/hermes-selfheal.log"
DATA_DIR = "/data/hermes"

# Size above which LOG_FILE is rotated to LOG_FILE + ".bak" (100 KB).
LOG_MAX_BYTES = 1024 * 100


def log(msg, level="INFO"):
    """Write one timestamped line to stdout and append it to LOG_FILE.

    Logging is best-effort: a failure to print or to write the file is
    ignored so that a logging problem can never abort a health check.

    Args:
        msg: Message text.
        level: Severity label placed in the line (e.g. "INFO", "WARN").
    """
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] [{level}] {msg}"
    try:
        print(line)
    except (UnicodeError, OSError, ValueError):
        # stdout may be closed or unable to encode the (Chinese) message.
        pass
    try:
        # Explicit UTF-8: the messages are non-ASCII and the locale default
        # encoding is not guaranteed to be able to represent them.
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except (OSError, UnicodeError):
        pass


def _run(cmd, timeout):
    """Run *cmd* (an argument list) capturing text output, with a timeout."""
    return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)


def _usage_from_table(stdout):
    """Parse the second line of `free -m` / `df -m` style output.

    Both tools print the total in column 1 and the used amount in column 2
    of the first data row.

    Returns:
        Tuple ``(used_mb, total_mb, percent)`` with percent rounded to one
        decimal place.

    Raises:
        ValueError: If the output has no data row, too few columns,
            non-numeric values, or a non-positive total.
    """
    lines = stdout.strip().split("\n")
    if len(lines) < 2:
        raise ValueError("no data row in command output")
    parts = lines[1].split()
    if len(parts) < 3:
        raise ValueError(f"unexpected data row: {lines[1]!r}")
    total_mb = int(parts[1])
    used_mb = int(parts[2])
    if total_mb <= 0:
        raise ValueError(f"non-positive total: {total_mb}")
    return used_mb, total_mb, round(used_mb / total_mb * 100, 1)


def _run_cleanup(cmd, description, timeout):
    """Run one cleanup command and report how it went.

    Returns:
        *description* when the command exited 0, *description* with a
        "partially failed" suffix when it exited non-zero, or None when the
        command could not be run at all (missing binary, timeout, ...).
    """
    try:
        result = _run(cmd, timeout)
    except (OSError, subprocess.SubprocessError) as e:
        log(f"{description}失败: {e}", "ERROR")
        return None
    if result.returncode != 0:
        # The command ran but reported errors, so some work may not have
        # been done; record that instead of claiming full success.
        log(f"{description}未完全成功 (exit {result.returncode})", "WARN")
        return f"{description}(部分失败)"
    return description


def _is_running(pattern):
    """Return whether `pgrep -f pattern` finds at least one process.

    Raises:
        RuntimeError: If pgrep exits with a status other than 0 (match) or
            1 (no match), so a pgrep error is not mistaken for "not running".
    """
    result = _run(["pgrep", "-f", pattern], timeout=5)
    if result.returncode == 0:
        return True
    if result.returncode == 1:
        return False
    raise RuntimeError(
        f"pgrep exited with {result.returncode}: {result.stderr.strip()}"
    )


def check_memory():
    """Log memory usage from `free -m` and delete old files above 90%.

    Above 90% usage it deletes `*.log` files older than 7 days from the
    known log directories, purges the pip cache and deletes files older than
    3 days under /tmp. Between 85% and 90% it only logs a warning.
    """
    try:
        result = _run(["free", "-m"], timeout=5)
        if result.returncode != 0:
            log(f"free 命令失败 (exit {result.returncode})", "WARN")
            return
        used_mb, total_mb, percent = _usage_from_table(result.stdout)
        log(f"内存: {used_mb}/{total_mb}MB ({percent}%)")

        if percent > 90:
            log("内存超过 90%,执行清理", "WARN")
            # NOTE(review): deleting files presumably only relieves memory
            # pressure when the paths are RAM-backed (e.g. tmpfs) -- confirm
            # this is the intended remediation.
            cleanup_actions = []

            # Old log files.
            for log_dir in ("/data/hermes/logs", "/tmp/hermes/logs", "/app/logs"):
                if os.path.exists(log_dir):
                    cleanup_actions.append(
                        _run_cleanup(
                            ["find", log_dir, "-name", "*.log",
                             "-mtime", "+7", "-delete"],
                            f"清理 {log_dir} 7天前日志",
                            timeout=10,
                        )
                    )

            # pip cache.
            cleanup_actions.append(
                _run_cleanup(["pip", "cache", "purge"], "清理 pip 缓存", timeout=10)
            )

            # Old files under /tmp.
            cleanup_actions.append(
                _run_cleanup(
                    ["find", "/tmp", "-type", "f", "-mtime", "+3", "-delete"],
                    "清理 /tmp 3天前文件",
                    timeout=10,
                )
            )

            done = [action for action in cleanup_actions if action]
            log(f"清理完成: {'; '.join(done)}")
        elif percent > 85:
            log("内存超过 85%,建议关注", "WARN")
    except Exception as e:
        log(f"内存检查失败: {e}", "ERROR")


def check_disk():
    """Log usage of the /data filesystem and delete old files above 90%.

    Above 90% usage it deletes files older than 14 days from the Hermes log
    and upload directories.
    """
    try:
        result = _run(["df", "-m", "/data"], timeout=5)
        if result.returncode != 0:
            log(f"df 命令失败 (exit {result.returncode})", "WARN")
            return
        used_mb, total_mb, percent = _usage_from_table(result.stdout)
        log(f"磁盘: {used_mb}/{total_mb}MB ({percent}%)")

        if percent > 90:
            log("磁盘超过 90%,清理旧数据", "WARN")
            cleanup_actions = []
            for old_dir in ("/data/hermes/logs", "/data/hermes/uploads"):
                if os.path.exists(old_dir):
                    cleanup_actions.append(
                        _run_cleanup(
                            ["find", old_dir, "-type", "f",
                             "-mtime", "+14", "-delete"],
                            f"清理 {old_dir} 14天前文件",
                            timeout=15,
                        )
                    )
            done = [action for action in cleanup_actions if action]
            log(f"清理完成: {'; '.join(done)}")
    except Exception as e:
        log(f"磁盘检查失败: {e}", "ERROR")


def check_process():
    """Check the gateway and dashboard processes; restart the gateway if down.

    The gateway is detected as a process whose command line matches
    "entry.py", the dashboard as one matching "7860". When the gateway is
    not found, /app/start.sh is run with a 30 second timeout.
    """
    try:
        gateway_running = _is_running("entry.py")
        # NOTE(review): "7860" matches any command line containing that
        # text, presumably the dashboard port -- confirm it is specific
        # enough.
        dashboard_running = _is_running("7860")

        log(f"Gateway: {'运行中' if gateway_running else '未运行'}")
        log(f"Dashboard: {'运行中' if dashboard_running else '未运行'}")

        if not gateway_running:
            log("Gateway 未运行!", "ERROR")
            # NOTE(review): subprocess.run waits for start.sh (and for its
            # captured pipes to close) and kills it on timeout; confirm that
            # start.sh detaches the gateway and returns promptly.
            try:
                result = _run(["bash", "/app/start.sh"], timeout=30)
            except Exception as e:
                log(f"重启失败: {e}", "ERROR")
            else:
                if result.returncode == 0:
                    log("已尝试重启 Gateway", "WARN")
                else:
                    log(
                        f"重启失败: start.sh 退出码 {result.returncode}: "
                        f"{result.stderr.strip()[-200:]}",
                        "ERROR",
                    )
    except Exception as e:
        log(f"进程检查失败: {e}", "ERROR")


def _load_saved_hashes(hash_file):
    """Load the stored name -> digest baseline, or {} if absent or unusable.

    A corrupt or non-dict baseline is treated as empty (with a warning) so
    that one bad write cannot make every later run fail.
    """
    try:
        with open(hash_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        log(f"配置 hash 基线损坏,将重建: {e}", "WARN")
        return {}
    if not isinstance(data, dict):
        log("配置 hash 基线格式异常,将重建", "WARN")
        return {}
    return data


def check_config_drift(config_files=None, hash_file=None):
    """Detect changes to tracked configuration files since the last run.

    Each tracked file's MD5 digest is compared with the digest stored by the
    previous run; the stored baseline is then replaced with the current
    digests, so a given change is reported once.

    Args:
        config_files: Optional mapping of display name -> file path.
            Defaults to SOUL.md and config.yaml under /app.
        hash_file: Optional path of the JSON baseline file. Defaults to
            ".config_hashes.json" inside DATA_DIR.

    Returns:
        Dict mapping the name of each drifted file to a short description
        (changed digest, or "file missing" for a previously tracked file
        that no longer exists). Empty when nothing drifted or the check
        itself failed.
    """
    if config_files is None:
        config_files = {
            "SOUL.md": "/app/SOUL.md",
            "config.yaml": "/app/config.yaml",
        }
    if hash_file is None:
        hash_file = os.path.join(DATA_DIR, ".config_hashes.json")

    drift = {}
    try:
        saved_hashes = _load_saved_hashes(hash_file)

        current_hashes = {}
        for name, path in config_files.items():
            try:
                with open(path, "rb") as f:
                    # MD5 is used only for change detection here and is kept
                    # so existing stored baselines stay comparable.
                    current_hashes[name] = hashlib.md5(f.read()).hexdigest()
            except FileNotFoundError:
                continue

        for name, digest in current_hashes.items():
            previous = saved_hashes.get(name)
            if previous is not None and previous != digest:
                drift[name] = (
                    f"hash changed from {str(previous)[:8]} to {digest[:8]}"
                )
        # A tracked file that had a baseline but has disappeared is drift too.
        for name in saved_hashes:
            if name in config_files and name not in current_hashes:
                drift[name] = "file missing"

        if drift:
            log(f"配置漂移检测: {drift}", "WARN")
        else:
            log("配置文件无漂移")

        # Store the new baseline. Write to a temporary file and rename it
        # into place so an interrupted run cannot leave a truncated file.
        hash_dir = os.path.dirname(hash_file)
        if hash_dir:
            os.makedirs(hash_dir, exist_ok=True)
        tmp_path = hash_file + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(current_hashes, f, indent=2)
        os.replace(tmp_path, hash_file)
    except Exception as e:
        log(f"配置漂移检测失败: {e}", "ERROR")
    return drift


def check_feishu_connection():
    """Log whether a process matching "websocket" is running.

    NOTE(review): this only looks for "websocket" in some process command
    line; it presumably serves as a proxy for the Feishu connection being
    up -- confirm against how the gateway is launched.
    """
    try:
        connected = _is_running("websocket")
        log(f"飞书 WebSocket: {'已连接' if connected else '可能断开'}")
        if not connected:
            log("飞书连接可能断开,建议检查", "WARN")
    except Exception as e:
        log(f"飞书连接检查失败: {e}", "ERROR")


def _rotate_log():
    """Move LOG_FILE to LOG_FILE + ".bak" when it exceeds LOG_MAX_BYTES.

    Returns:
        True if the file was rotated, False otherwise.
    """
    try:
        if os.path.getsize(LOG_FILE) > LOG_MAX_BYTES:
            os.replace(LOG_FILE, LOG_FILE + ".bak")
            return True
    except FileNotFoundError:
        pass
    except OSError as e:
        log(f"日志轮转失败: {e}", "WARN")
    return False


def main():
    """Rotate the log if needed, then run every health check in order."""
    # Rotate before the first log line so one run's output stays together
    # in a single file.
    rotated = _rotate_log()

    log("=" * 40)
    log("自愈检查启动")
    if rotated:
        log("日志轮转完成")

    check_process()
    check_memory()
    check_disk()
    check_config_drift()
    check_feishu_connection()

    log("自愈检查完成")
    log("=" * 40)


if __name__ == "__main__":
    main()