Spaces:
Running
Running
Z User committed on
Commit ·
7cae251
1
Parent(s): 6414e6b
fix: bind api_server to 127.0.0.1 + fix watchdog process detection
Browse files
Root cause: api_server refused to bind 0.0.0.0 without API_SERVER_KEY,
so the /health endpoint on :8642 was never available. But the Gateway
process WAS running and connected to Feishu WebSocket.
Fix 1: Change api_server host from 0.0.0.0 to 127.0.0.1 (only local
access needed since entry.py proxy handles external requests)
Fix 2: Watchdog now checks process liveness (kill -0 PID) instead of
HTTP health check, preventing false restart of a running gateway
- config.yaml +1 -1
- start.sh +21 -14
config.yaml
CHANGED
|
@@ -13,7 +13,7 @@ platforms:
|
|
| 13 |
api_server:
|
| 14 |
enabled: true
|
| 15 |
extra:
|
| 16 |
-
host: 0.0.0.0
|
| 17 |
port: 8642
|
| 18 |
cors_origins: "*"
|
| 19 |
memory:
|
|
|
|
| 13 |
api_server:
|
| 14 |
enabled: true
|
| 15 |
extra:
|
| 16 |
+
host: 127.0.0.1
|
| 17 |
port: 8642
|
| 18 |
cors_origins: "*"
|
| 19 |
memory:
|
start.sh
CHANGED
|
@@ -129,31 +129,38 @@ echo ""
|
|
| 129 |
|
| 130 |
# ── Start Gateway watchdog in background ──
|
| 131 |
(
|
| 132 |
-
echo "[$(date)] Gateway watchdog started (check interval:
|
| 133 |
while true; do
|
| 134 |
-
sleep
|
| 135 |
-
# Check if gateway
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
fi
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
| 145 |
# Clean stale state
|
| 146 |
rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
|
| 147 |
rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
|
| 148 |
rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
|
| 149 |
# Restart
|
| 150 |
start_gateway
|
| 151 |
-
# Brief wait then verify
|
| 152 |
sleep 15
|
| 153 |
-
if
|
| 154 |
echo "[$(date)] Gateway restarted successfully"
|
| 155 |
else
|
| 156 |
-
echo "[$(date)] Gateway restart failed — will retry in
|
| 157 |
fi
|
| 158 |
fi
|
| 159 |
done
|
|
|
|
| 129 |
|
| 130 |
# ── Start Gateway watchdog in background ──
|
| 131 |
(
|
| 132 |
+
echo "[$(date)] Gateway watchdog started (check interval: 60s)"
|
| 133 |
while true; do
|
| 134 |
+
sleep 60
|
| 135 |
+
# Check if gateway process is still alive
|
| 136 |
+
GW_ALIVE=false
|
| 137 |
+
if [ -f /tmp/hermes-gateway.pid ]; then
|
| 138 |
+
if kill -0 "$(cat /tmp/hermes-gateway.pid)" 2>/dev/null; then
|
| 139 |
+
GW_ALIVE=true
|
| 140 |
+
fi
|
| 141 |
+
fi
|
| 142 |
+
# Also check via gateway.pid in hermes home
|
| 143 |
+
if [ "$GW_ALIVE" = false ] && [ -f "$HERMES_HOME/gateway.pid" ]; then
|
| 144 |
+
if kill -0 "$(cat "$HERMES_HOME/gateway.pid")" 2>/dev/null; then
|
| 145 |
+
GW_ALIVE=true
|
| 146 |
+
# Sync the PID file
|
| 147 |
+
cp "$HERMES_HOME/gateway.pid" /tmp/hermes-gateway.pid
|
| 148 |
fi
|
| 149 |
+
fi
|
| 150 |
+
|
| 151 |
+
if [ "$GW_ALIVE" = false ]; then
|
| 152 |
+
echo "[$(date)] Gateway process is DEAD — restarting..."
|
| 153 |
# Clean stale state
|
| 154 |
rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
|
| 155 |
rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
|
| 156 |
rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
|
| 157 |
# Restart
|
| 158 |
start_gateway
|
|
|
|
| 159 |
sleep 15
|
| 160 |
+
if [ -f /tmp/hermes-gateway.pid ] && kill -0 "$(cat /tmp/hermes-gateway.pid)" 2>/dev/null; then
|
| 161 |
echo "[$(date)] Gateway restarted successfully"
|
| 162 |
else
|
| 163 |
+
echo "[$(date)] Gateway restart failed — will retry in 60s"
|
| 164 |
fi
|
| 165 |
fi
|
| 166 |
done
|