#!/usr/bin/env python3
"""
Hermes 自愈脚本 - 进程/内存/OOM/配置漂移检测
通过 cronjob 定期调用,异常时自动修复或告警
"""
import subprocess
import json
import os
import sys
from datetime import datetime
# File that log() appends every log line to; main() rotates it to "<name>.bak".
LOG_FILE = "/tmp/hermes-selfheal.log"
# Directory in which check_config_drift() stores its saved hash file.
DATA_DIR = "/data/hermes"
def log(msg, level="INFO"):
    """Print a timestamped log line and append the same line to ``LOG_FILE``.

    Args:
        msg: Message text to record.
        level: Severity tag embedded in the line ("INFO" by default).

    The file write is best-effort: I/O and encoding errors are ignored so that
    a logging problem can never abort a health check. The line has already
    been printed to stdout by then.
    """
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] [{level}] {msg}"
    print(line)
    try:
        # Messages are non-ASCII; pin the encoding so the write does not
        # depend on the locale of whatever environment launches the script.
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except (OSError, UnicodeError):
        pass
def _run_cleanup_command(cmd, description):
    """Run one cleanup command; return True only when it exits with status 0."""
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    except (OSError, subprocess.SubprocessError) as e:
        # Covers a missing executable as well as a timeout.
        log(f"{description}失败: {e}", "ERROR")
        return False
    if proc.returncode != 0:
        log(f"{description}失败: 退出码 {proc.returncode}", "WARN")
        return False
    return True


def _cleanup_for_memory_pressure():
    """Run every cleanup step and return the descriptions of those that succeeded."""
    steps = []
    # Old log files (*.log older than 7 days) in each log directory that exists.
    for log_dir in ["/data/hermes/logs", "/tmp/hermes/logs", "/app/logs"]:
        if os.path.exists(log_dir):
            steps.append((
                ["find", log_dir, "-name", "*.log", "-mtime", "+7", "-delete"],
                f"清理 {log_dir} 7天前日志",
            ))
    # pip download/wheel cache.
    steps.append((["pip", "cache", "purge"], "清理 pip 缓存"))
    # Regular files under /tmp older than 3 days.
    steps.append((
        ["find", "/tmp", "-type", "f", "-mtime", "+3", "-delete"],
        "清理 /tmp 3天前文件",
    ))
    return [desc for cmd, desc in steps if _run_cleanup_command(cmd, desc)]


def check_memory():
    """Log memory usage from ``free -m`` and run cleanup when it is very high.

    Usage is computed as used/total from the second line of the ``free -m``
    output. Above 90% the cleanup steps run and the ones that succeeded are
    logged; above 85% only a warning is logged. Nothing is returned, and any
    unexpected error is logged rather than raised.

    NOTE(review): the cleanup steps delete files on disk; they reduce memory
    use only if those paths are memory-backed -- confirm this is intended.
    """
    try:
        result = subprocess.run(
            ["free", "-m"], capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            return
        lines = result.stdout.strip().split("\n")
        if len(lines) < 2:
            # No data row to parse (same guard as check_disk uses for df).
            return
        parts = lines[1].split()
        used_mb = int(parts[2])
        total_mb = int(parts[1])
        percent = round(used_mb / total_mb * 100, 1)
        log(f"内存: {used_mb}/{total_mb}MB ({percent}%)")
        if percent > 90:
            log("内存超过 90%,执行清理", "WARN")
            # Only steps whose command actually succeeded are reported as done.
            cleanup_actions = _cleanup_for_memory_pressure()
            log(f"清理完成: {'; '.join(cleanup_actions)}")
        elif percent > 85:
            log("内存超过 85%,建议关注", "WARN")
    except Exception as e:
        log(f"内存检查失败: {e}", "ERROR")
def check_disk():
    """Log disk usage of ``/data`` and delete old files when it is very high.

    Usage is computed as used/total from the data row of ``df -m -P /data``.
    Above 90%, regular files older than 14 days are deleted from the log and
    upload directories that exist. Nothing is returned, and any unexpected
    error is logged rather than raised.
    """
    try:
        # -P forces one output line per filesystem; without it df may wrap a
        # long device name onto its own line and shift the columns parsed below.
        result = subprocess.run(
            ["df", "-m", "-P", "/data"], capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            return
        lines = result.stdout.strip().split("\n")
        if len(lines) < 2:
            return
        parts = lines[1].split()
        used_mb = int(parts[2])
        total_mb = int(parts[1])
        percent = round(used_mb / total_mb * 100, 1)
        log(f"磁盘: {used_mb}/{total_mb}MB ({percent}%)")
        if percent > 90:
            log("磁盘超过 90%,清理旧数据", "WARN")
            for old_dir in ["/data/hermes/logs", "/data/hermes/uploads"]:
                if not os.path.exists(old_dir):
                    continue
                try:
                    subprocess.run(
                        ["find", old_dir, "-type", "f", "-mtime", "+14", "-delete"],
                        capture_output=True, text=True, timeout=15,
                    )
                except (OSError, subprocess.SubprocessError) as e:
                    # Keep going so one failing directory does not skip the next.
                    log(f"清理 {old_dir} 失败: {e}", "ERROR")
    except Exception as e:
        log(f"磁盘检查失败: {e}", "ERROR")
def check_process():
    """Log whether the Gateway and Dashboard are running; restart the Gateway if not.

    A process counts as running when ``pgrep -f`` finds a command line that
    contains ``entry.py`` (Gateway) or ``7860`` (Dashboard). When the Gateway
    is missing, ``/app/start.sh`` is run once with a 30 second timeout.
    Nothing is returned, and any unexpected error is logged rather than raised.
    """
    try:
        # Gateway: any command line mentioning entry.py.
        result = subprocess.run(
            ["pgrep", "-f", "entry.py"], capture_output=True, text=True, timeout=5
        )
        gateway_running = result.returncode == 0
        # Dashboard: any command line mentioning 7860.
        result = subprocess.run(
            ["pgrep", "-f", "7860"], capture_output=True, text=True, timeout=5
        )
        dashboard_running = result.returncode == 0
        log(f"Gateway: {'运行中' if gateway_running else '未运行'}")
        log(f"Dashboard: {'运行中' if dashboard_running else '未运行'}")
        if not gateway_running:
            log("Gateway 未运行!", "ERROR")
            try:
                # The output is never read, so discard it instead of piping it.
                # If the script leaves a process in the background, that process
                # would inherit the pipes: run() would then wait for them until
                # the timeout, and the pipes would break once this script exits.
                restart = subprocess.run(
                    ["bash", "/app/start.sh"],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=30,
                )
                if restart.returncode != 0:
                    log(f"重启失败: 退出码 {restart.returncode}", "ERROR")
                else:
                    log("已尝试重启 Gateway", "WARN")
            except Exception as e:
                log(f"重启失败: {e}", "ERROR")
    except Exception as e:
        log(f"进程检查失败: {e}", "ERROR")
def check_config_drift():
    """Compare config file hashes with the saved baseline and log any change.

    The MD5 of each existing config file is compared with the value stored in
    ``<DATA_DIR>/.config_hashes.json``. Differences, and tracked files that no
    longer exist, are logged as drift. The baseline is then overwritten with
    the current hashes, so each change is reported once. Nothing is returned,
    and any unexpected error is logged rather than raised.
    """
    import hashlib
    config_files = {
        "SOUL.md": "/app/SOUL.md",
        "config.yaml": "/app/config.yaml",
    }
    hash_file = os.path.join(DATA_DIR, ".config_hashes.json")
    try:
        saved_hashes = {}
        try:
            with open(hash_file, "r") as f:
                loaded = json.load(f)
            # Anything other than a JSON object is treated as "no baseline".
            if isinstance(loaded, dict):
                saved_hashes = loaded
        except FileNotFoundError:
            pass  # first run: nothing to compare against
        except ValueError as e:
            # Corrupt baseline (invalid JSON or undecodable bytes). Start over,
            # otherwise every later run would fail before rewriting the file.
            log(f"配置基线文件无效,将重建: {e}", "WARN")
        current_hashes = {}
        for name, path in config_files.items():
            if os.path.exists(path):
                with open(path, "rb") as f:
                    # MD5 is kept so existing baselines remain comparable; it is
                    # used here for change detection only.
                    current_hashes[name] = hashlib.md5(f.read()).hexdigest()
        drift = {}
        for name, h in current_hashes.items():
            if name in saved_hashes and saved_hashes[name] != h:
                drift[name] = f"hash changed from {str(saved_hashes[name])[:8]} to {h[:8]}"
        # A file that had a baseline entry but is gone now is drift as well.
        for name in config_files:
            if name in saved_hashes and name not in current_hashes:
                drift[name] = f"file missing (was {str(saved_hashes[name])[:8]})"
        if drift:
            log(f"配置漂移检测: {drift}", "WARN")
        else:
            log("配置文件无漂移")
        # Store the current hashes as the new baseline; make sure the
        # directory exists so the first run on a fresh volume can write it.
        os.makedirs(os.path.dirname(hash_file), exist_ok=True)
        with open(hash_file, "w") as f:
            json.dump(current_hashes, f, indent=2)
    except Exception as e:
        log(f"配置漂移检测失败: {e}", "ERROR")
def check_feishu_connection():
    """Log whether a Feishu WebSocket process appears to be present.

    The connection is considered up when ``pgrep -f websocket`` exits with
    status 0; otherwise an extra warning is logged. Nothing is returned, and
    any error is logged rather than raised.
    """
    try:
        probe = subprocess.run(
            ["pgrep", "-f", "websocket"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        if probe.returncode == 0:
            status = "已连接"
        else:
            status = "可能断开"
        log(f"飞书 WebSocket: {status}")
        if probe.returncode != 0:
            log("飞书连接可能断开,建议检查", "WARN")
    except Exception as err:
        log(f"飞书连接检查失败: {err}", "ERROR")
def main():
    """Run one self-heal pass: rotate the log if needed, then run every check.

    The log file is renamed to ``LOG_FILE + ".bak"`` once it exceeds 100 KB;
    rotation errors are ignored. The checks run in a fixed order, framed by
    start and end banner lines.
    """
    separator = "=" * 40
    log(separator)
    log("自愈检查启动")

    # Log rotation (best-effort).
    max_bytes = 1024 * 100  # 100KB
    try:
        if os.path.exists(LOG_FILE) and os.path.getsize(LOG_FILE) > max_bytes:
            os.rename(LOG_FILE, LOG_FILE + ".bak")
            log("日志轮转完成")
    except Exception:
        pass

    checks = (
        check_process,
        check_memory,
        check_disk,
        check_config_drift,
        check_feishu_connection,
    )
    for run_check in checks:
        run_check()

    log("自愈检查完成")
    log(separator)
# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()