File size: 7,173 Bytes
020c94b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#!/usr/bin/env python3
"""
Hermes self-heal script - process / memory / OOM / config-drift checks.

Meant to be invoked periodically by a cron job; when something is abnormal it
attempts an automatic fix or raises an alert.
"""
# NOTE(review): no dedicated OOM check is visible in this file - confirm
# whether it lives elsewhere or the summary above is out of date.

import subprocess
import json
import os
import sys
from datetime import datetime

# File that log() appends to; main() renames it to LOG_FILE + ".bak" once it
# grows past 100KB.
LOG_FILE = "/tmp/hermes-selfheal.log"
# Directory in which check_config_drift() keeps its ".config_hashes.json"
# baseline.
DATA_DIR = "/data/hermes"


def log(msg, level="INFO"):
    """Print a timestamped message and append it to the self-heal log file.

    Args:
        msg: Text to record.
        level: Severity tag shown in brackets after the timestamp (callers
            in this file pass "INFO", "WARN" or "ERROR").

    The line is always printed to stdout. Appending to ``LOG_FILE`` is
    best-effort: a failure there is reported on stderr and never raised, so
    a broken log file cannot abort the health checks.
    """
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] [{level}] {msg}"
    print(line)
    try:
        # The messages are non-ASCII, so pin the encoding instead of relying
        # on the locale of whatever environment launches the script.
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except (OSError, UnicodeError) as exc:
        # Cannot use log() here; surface the problem without raising.
        print(f"[{ts}] [ERROR] 写入日志文件失败: {exc}", file=sys.stderr)


def _run_cleanup_step(cmd, fail_label, timeout=10):
    """Run one cleanup command and report whether it exited successfully.

    Args:
        cmd: Argument list passed to ``subprocess.run``.
        fail_label: Prefix of the message logged when the step fails.
        timeout: Seconds to wait before the command is abandoned.

    Returns:
        True when the command ran and exited with status 0, otherwise False
        (the reason is logged).
    """
    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout
        )
    except (OSError, subprocess.SubprocessError) as e:
        # Missing executable, timeout, etc.
        log(f"{fail_label}: {e}", "ERROR")
        return False
    if proc.returncode != 0:
        # A non-zero exit can still mean partial work was done, so this is
        # logged as a warning rather than an error.
        log(f"{fail_label}: 退出码 {proc.returncode}", "WARN")
        return False
    return True


def check_memory():
    """Check memory usage via ``free -m`` and run cleanup when it is too high.

    Reads the total and used columns from the second line of the ``free -m``
    output and logs the usage percentage. Above 90 % it deletes ``*.log``
    files older than 7 days from the known log directories, purges the pip
    cache and deletes files older than 3 days under ``/tmp``. Above 85 % (up
    to 90 %) it only logs a warning.

    Each cleanup step is independent: a step that raises or exits non-zero is
    logged and left out of the completion summary, and the remaining steps
    still run. Returns nothing; any unexpected failure (including output
    that cannot be parsed) is logged as an error instead of being raised.
    """
    try:
        result = subprocess.run(
            ["free", "-m"], capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            return

        # Second output line holds the memory row: label, total, used, ...
        lines = result.stdout.strip().split("\n")
        parts = lines[1].split()
        used_mb = int(parts[2])
        total_mb = int(parts[1])
        percent = round(used_mb / total_mb * 100, 1)

        log(f"内存: {used_mb}/{total_mb}MB ({percent}%)")

        if percent > 90:
            log("内存超过 90%,执行清理", "WARN")
            cleanup_actions = []

            # NOTE(review): these steps delete files; they lower memory usage
            # only where the paths are memory-backed (e.g. tmpfs) - confirm
            # this is the intended remedy.

            # Old log files.
            for log_dir in ["/data/hermes/logs", "/tmp/hermes/logs", "/app/logs"]:
                if os.path.exists(log_dir) and _run_cleanup_step(
                    ["find", log_dir, "-name", "*.log", "-mtime", "+7", "-delete"],
                    "清理日志失败",
                ):
                    cleanup_actions.append(f"清理 {log_dir} 7天前日志")

            # pip cache.
            if _run_cleanup_step(["pip", "cache", "purge"], "清理 pip 缓存失败"):
                cleanup_actions.append("清理 pip 缓存")

            # Stale files under /tmp.
            if _run_cleanup_step(
                ["find", "/tmp", "-type", "f", "-mtime", "+3", "-delete"],
                "清理 /tmp 旧文件失败",
            ):
                cleanup_actions.append("清理 /tmp 3天前文件")

            # Only steps that actually succeeded are listed.
            log(f"清理完成: {'; '.join(cleanup_actions)}")

        elif percent > 85:
            log("内存超过 85%,建议关注", "WARN")

    except Exception as e:
        log(f"内存检查失败: {e}", "ERROR")


def check_disk():
    """Check disk usage for ``/data`` and prune old files when nearly full.

    Parses the second line of ``df -m /data`` (total and used columns), logs
    the usage percentage and, above 90 %, deletes files older than 14 days
    from the Hermes log and upload directories.

    Each directory is cleaned independently: a timeout, launch error or
    non-zero exit for one directory is logged and does not stop the next one.
    Returns nothing; unexpected failures are logged rather than raised.
    """
    try:
        result = subprocess.run(
            ["df", "-m", "/data"], capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            return

        lines = result.stdout.strip().split("\n")
        if len(lines) < 2:
            return

        # Data row: filesystem, total, used, ...
        parts = lines[1].split()
        used_mb = int(parts[2])
        total_mb = int(parts[1])
        percent = round(used_mb / total_mb * 100, 1)

        log(f"磁盘: {used_mb}/{total_mb}MB ({percent}%)")

        if percent > 90:
            log("磁盘超过 90%,清理旧数据", "WARN")
            for old_dir in ["/data/hermes/logs", "/data/hermes/uploads"]:
                if not os.path.exists(old_dir):
                    continue
                try:
                    proc = subprocess.run(
                        ["find", old_dir, "-type", "f", "-mtime", "+14", "-delete"],
                        capture_output=True, text=True, timeout=15,
                    )
                except (OSError, subprocess.SubprocessError) as e:
                    # Keep going: the other directory may still be cleanable.
                    log(f"清理 {old_dir} 失败: {e}", "ERROR")
                    continue
                if proc.returncode == 0:
                    log(f"已清理 {old_dir} 14天前文件")
                else:
                    log(f"清理 {old_dir} 失败: 退出码 {proc.returncode}", "WARN")

    except Exception as e:
        log(f"磁盘检查失败: {e}", "ERROR")


def check_process():
    """Check whether the Hermes processes are running; restart the Gateway.

    Uses ``pgrep -f`` to look for a command line matching ``entry.py``
    (reported as the Gateway) and one matching ``7860`` (reported as the
    Dashboard), and logs both states. When no Gateway match is found it runs
    ``bash /app/start.sh`` with a 30 second limit. A missing Dashboard is
    only logged.

    Returns nothing; failures are logged rather than raised.
    """
    try:
        # Gateway: any process whose command line matches "entry.py".
        result = subprocess.run(
            ["pgrep", "-f", "entry.py"], capture_output=True, text=True, timeout=5
        )
        gateway_running = result.returncode == 0

        # Dashboard: any process whose command line matches "7860"
        # (presumably its port - confirm).
        result = subprocess.run(
            ["pgrep", "-f", "7860"], capture_output=True, text=True, timeout=5
        )
        dashboard_running = result.returncode == 0

        log(f"Gateway: {'运行中' if gateway_running else '未运行'}")
        log(f"Dashboard: {'运行中' if dashboard_running else '未运行'}")

        if not gateway_running:
            log("Gateway 未运行!", "ERROR")
            # Attempt a restart.
            try:
                # The script's output was never used, so discard it instead
                # of capturing it: with pipes, a background process left
                # behind by start.sh would keep them open and run() would
                # block until the timeout, reporting a false failure.
                restart = subprocess.run(
                    ["bash", "/app/start.sh"],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=30,
                )
                log("已尝试重启 Gateway", "WARN")
                if restart.returncode != 0:
                    log(f"重启失败: start.sh 退出码 {restart.returncode}", "ERROR")
            except Exception as e:
                log(f"重启失败: {e}", "ERROR")

    except Exception as e:
        log(f"进程检查失败: {e}", "ERROR")


def check_config_drift():
    """Detect changes to the tracked config files since the previous run.

    Computes an MD5 digest of each tracked file that exists, compares it with
    the digest stored in ``.config_hashes.json`` under ``DATA_DIR`` and logs
    a warning listing every file whose digest changed or that was present in
    the baseline but is now missing. The baseline is then overwritten with
    the current digests, so a given change is reported once.

    A baseline file that cannot be parsed, or that does not contain a JSON
    object, is treated as empty and rebuilt instead of disabling the check.
    Returns nothing; failures are logged rather than raised.
    """
    import hashlib

    config_files = {
        "SOUL.md": "/app/SOUL.md",
        "config.yaml": "/app/config.yaml",
    }

    hash_file = os.path.join(DATA_DIR, ".config_hashes.json")

    try:
        saved_hashes = {}
        if os.path.exists(hash_file):
            try:
                with open(hash_file, "r", encoding="utf-8") as f:
                    loaded = json.load(f)
            except ValueError as e:
                # json.JSONDecodeError and UnicodeDecodeError are both
                # ValueError subclasses; fall through and rewrite the file.
                log(f"配置基线文件损坏,将重建: {e}", "WARN")
                loaded = {}
            if isinstance(loaded, dict):
                saved_hashes = loaded

        current_hashes = {}
        for name, path in config_files.items():
            if os.path.exists(path):
                with open(path, "rb") as f:
                    # MD5 is used only as a change fingerprint here.
                    current_hashes[name] = hashlib.md5(f.read()).hexdigest()

        drift = {}
        for name in config_files:
            old = saved_hashes.get(name)
            new = current_hashes.get(name)
            if old is None or old == new:
                # No baseline for this file yet, or unchanged.
                continue
            if new is None:
                drift[name] = f"file missing (was {str(old)[:8]})"
            else:
                drift[name] = f"hash changed from {str(old)[:8]} to {new[:8]}"

        if drift:
            log(f"配置漂移检测: {drift}", "WARN")
        else:
            log("配置文件无漂移")

        # Store the current digests as the new baseline.
        with open(hash_file, "w", encoding="utf-8") as f:
            json.dump(current_hashes, f, indent=2)

    except Exception as e:
        log(f"配置漂移检测失败: {e}", "ERROR")


def check_feishu_connection():
    """Report whether a Feishu WebSocket process appears to be running.

    Runs ``pgrep -f websocket`` and treats exit status 0 as "connected".
    The state is always logged; when no match is found an extra warning
    suggests checking the connection. Failures are logged, never raised.
    """
    try:
        probe = subprocess.run(
            ["pgrep", "-f", "websocket"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        alive = probe.returncode == 0

        status = "已连接" if alive else "可能断开"
        log(f"飞书 WebSocket: {status}")

        if alive:
            return
        log("飞书连接可能断开,建议检查", "WARN")
    except Exception as err:
        log(f"飞书连接检查失败: {err}", "ERROR")


def main():
    """Run every self-heal check once, in a fixed order.

    Rotates the log file first (renaming it to ``LOG_FILE + ".bak"`` when it
    is larger than 100KB), then logs a start banner, runs the process,
    memory, disk, config-drift and Feishu checks, and logs an end banner.
    A rotation failure is logged as a warning and does not stop the checks.
    """
    # Rotate before the first log() call so this run's output is not split
    # between the rotated file and the fresh one.
    rotated = False
    rotate_error = None
    try:
        if os.path.exists(LOG_FILE):
            size = os.path.getsize(LOG_FILE)
            if size > 1024 * 100:  # 100KB
                os.rename(LOG_FILE, LOG_FILE + ".bak")
                rotated = True
    except OSError as e:
        rotate_error = e

    log("=" * 40)
    log("自愈检查启动")

    if rotated:
        log("日志轮转完成")
    elif rotate_error is not None:
        log(f"日志轮转失败: {rotate_error}", "WARN")

    check_process()
    check_memory()
    check_disk()
    check_config_drift()
    check_feishu_connection()

    log("自愈检查完成")
    log("=" * 40)


# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()