Spaces:
Running
Running
Z User committed on
Commit ·
300ab73
1
Parent(s): 11dc522
fix: add gateway watchdog + stale PID cleanup + retry logic
Browse files
- Clean stale gateway.pid/lock files on startup (prevents 'already running' false positive)
- Retry gateway start up to 3 times with diagnostic log output on failure
- Extend startup timeout from 60s to 90s
- Add background watchdog that auto-restarts gateway every 30s if health check fails
- Watchdog cleans stale state before each restart attempt
start.sh
CHANGED
|
@@ -20,6 +20,13 @@ done
|
|
| 20 |
|
| 21 |
echo "Persistent storage ready."
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# Initialize MemPalace if not already
|
| 24 |
PALACE_PATH="${MEMPALACE_PALACE_PATH:-/data/hermes/palace}"
|
| 25 |
if [ ! -f "$PALACE_PATH/.palace_initialized" ]; then
|
|
@@ -31,25 +38,66 @@ else
|
|
| 31 |
echo "MemPalace already initialized."
|
| 32 |
fi
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
# Start Hermes Gateway (aiohttp API server on :8642 + Feishu platform)
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
echo "Waiting for Gateway to start..."
|
| 43 |
-
for i in $(seq 1 30); do
|
| 44 |
-
if curl -s http://127.0.0.1:8642/health > /dev/null 2>&1; then
|
| 45 |
-
echo "Gateway is ready on :8642"
|
| 46 |
break
|
| 47 |
fi
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
done
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
# Start hermes-web-ui Node.js BFF server on :6060
|
| 52 |
-
echo "Starting hermes-web-ui BFF..."
|
| 53 |
export PORT=6060
|
| 54 |
export UPSTREAM=http://127.0.0.1:8642
|
| 55 |
export HERMES_HOME=/root/.hermes
|
|
@@ -59,13 +107,13 @@ export NODE_ENV=production
|
|
| 59 |
cd /app/webui-server
|
| 60 |
node index.js >> /data/hermes/logs/webui.log 2>&1 &
|
| 61 |
WEBUI_PID=$!
|
| 62 |
-
echo "WebUI BFF PID: $WEBUI_PID"
|
| 63 |
|
| 64 |
# Wait for WebUI BFF to be ready
|
| 65 |
-
echo "Waiting for WebUI BFF to start..."
|
| 66 |
for i in $(seq 1 30); do
|
| 67 |
if curl -s http://127.0.0.1:6060/health > /dev/null 2>&1; then
|
| 68 |
-
echo "WebUI BFF is ready on :6060"
|
| 69 |
break
|
| 70 |
fi
|
| 71 |
sleep 2
|
|
@@ -79,5 +127,39 @@ echo " Proxy: http://0.0.0.0:7860"
|
|
| 79 |
echo " Auth Token: $AUTH_TOKEN"
|
| 80 |
echo ""
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
# Start Python proxy on :7860 (main HF Space port)
|
| 83 |
exec python3 /app/entry.py
|
|
|
|
| 20 |
|
| 21 |
echo "Persistent storage ready."
|
| 22 |
|
| 23 |
+
# ── Clean up stale PID/lock files from previous crash ──
# HERMES_HOME is exported further down in this script; unless it is already
# present in the environment at this point, "$HERMES_HOME/..." would expand to
# "/gateway.pid" and nothing would be cleaned. Fall back to the same directory
# that the later export uses.
echo "Cleaning up stale state..."
for stale_name in gateway.pid .gateway_runtime_lock .gateway_takeover; do
  # Best effort: a missing file is not an error.
  rm -f "${HERMES_HOME:-/root/.hermes}/$stale_name" 2>/dev/null
done
echo "Stale state cleaned."
|
| 29 |
+
|
| 30 |
# Initialize MemPalace if not already
|
| 31 |
PALACE_PATH="${MEMPALACE_PALACE_PATH:-/data/hermes/palace}"
|
| 32 |
if [ ! -f "$PALACE_PATH/.palace_initialized" ]; then
|
|
|
|
| 38 |
echo "MemPalace already initialized."
|
| 39 |
fi
|
| 40 |
|
| 41 |
+
# ── Gateway management functions ──
|
| 42 |
+
# Launch the Hermes gateway in the background and record its PID.
#
# Any process still recorded in /tmp/hermes-gateway.pid is stopped first
# (SIGTERM, up to ~5s grace, then SIGKILL). Without this, a retry after a
# startup timeout would delete the lock files of a gateway that is still
# running and start a second instance next to it.
#
# Globals:   HERMES_HOME (read; /root/.hermes is used when unset or empty)
# Files:     removes gateway.pid, .gateway_runtime_lock and .gateway_takeover
#            under the Hermes home; rewrites /tmp/hermes-gateway.pid
# Outputs:   progress lines on stdout; the gateway's own output is appended
#            to /data/hermes/logs/gateway.log
# Returns:   status of the final echo; the launch itself is not checked here
#            (callers poll with wait_for_gateway).
start_gateway() {
  local hermes_home="${HERMES_HOME:-/root/.hermes}"
  local pid_file=/tmp/hermes-gateway.pid
  local old_pid=""
  local waited=0

  if [ -f "$pid_file" ]; then
    old_pid=$(cat "$pid_file" 2>/dev/null) || old_pid=""
  fi
  # Only ever signal a plain positive PID; anything else is ignored.
  case "$old_pid" in
    '' | *[!0-9]*) old_pid="" ;;
  esac
  if [ -n "$old_pid" ] && [ "$old_pid" -gt 1 ] && kill -0 "$old_pid" 2>/dev/null; then
    echo "[$(date)] Stopping previous Gateway process (PID $old_pid)..."
    kill "$old_pid" 2>/dev/null || true
    # Bounded wait: a zombie we cannot reap still answers kill -0.
    while kill -0 "$old_pid" 2>/dev/null && [ "$waited" -lt 5 ]; do
      sleep 1
      waited=$((waited + 1))
    done
    kill -9 "$old_pid" 2>/dev/null || true
  fi

  # Clean stale PID/lock before each start attempt
  rm -f "$hermes_home/gateway.pid" 2>/dev/null
  rm -f "$hermes_home/.gateway_runtime_lock" 2>/dev/null
  rm -f "$hermes_home/.gateway_takeover" 2>/dev/null

  echo "[$(date)] Starting Hermes Gateway..."
  PYTHONUNBUFFERED=1 HERMES_ACCEPT_HOOKS=1 python3 -u -m hermes_cli.main gateway run -v \
    >> /data/hermes/logs/gateway.log 2>&1 &
  echo "$!" > "$pid_file"
  echo "[$(date)] Gateway PID: $(cat "$pid_file")"
}
|
| 54 |
+
|
| 55 |
+
# Poll the gateway health endpoint until it answers or startup is given up.
#
# Usage:     wait_for_gateway [attempts] [interval_seconds]
# Arguments: $1 - number of polls (default 45)
#            $2 - whole seconds to sleep between polls (default 2)
#            The defaults give the 90s startup budget.
# Outputs:   progress lines on stdout; when the process recorded in
#            /tmp/hermes-gateway.pid has died, also the last 80 lines of
#            /data/hermes/logs/gateway.log.
# Returns:   0 once http://127.0.0.1:8642/health answers,
#            1 if the recorded process died or the attempts ran out.
wait_for_gateway() {
  local attempts="${1:-45}"
  local interval="${2:-2}"
  local pid_file=/tmp/hermes-gateway.pid
  local i=1
  local pid

  echo "[$(date)] Waiting for Gateway to start..."
  while [ "$i" -le "$attempts" ]; do
    # --max-time keeps a gateway that accepts the connection but never
    # answers from stalling the loop (same cap as the watchdog's probe).
    if curl -s --max-time 5 http://127.0.0.1:8642/health > /dev/null 2>&1; then
      echo "[$(date)] Gateway is ready on :8642"
      return 0
    fi
    # Check if process is still alive
    if [ -f "$pid_file" ]; then
      pid=$(cat "$pid_file")
      if ! kill -0 "$pid" 2>/dev/null; then
        echo "[$(date)] Gateway process died during startup (attempt $i/$attempts)"
        echo "[$(date)] --- Last 80 lines of gateway.log ---"
        tail -n 80 /data/hermes/logs/gateway.log
        echo "[$(date)] --- End gateway.log ---"
        return 1
      fi
    fi
    sleep "$interval"
    i=$((i + 1))
  done
  echo "[$(date)] Gateway startup timed out after $((attempts * interval))s"
  return 1
}
|
| 77 |
+
|
| 78 |
# Start Hermes Gateway (aiohttp API server on :8642 + Feishu platform)
start_gateway

# Give the gateway three chances to come up. After each failed wait except
# the last, pause 5s and launch it again.
GATEWAY_OK=false
for attempt in 1 2 3; do
  if wait_for_gateway; then
    GATEWAY_OK=true
    break
  fi
  # Nothing left to relaunch after the final wait.
  if [ "$attempt" -ge 3 ]; then
    continue
  fi
  echo "[$(date)] Retrying gateway start (attempt $((attempt+1))/3)..."
  sleep 5
  start_gateway
done

# Not fatal: startup continues without a healthy gateway.
case "$GATEWAY_OK" in
  false)
    echo "[$(date)] WARNING: Gateway failed after 3 attempts. Watchdog will keep retrying."
    ;;
esac
|
| 98 |
+
|
| 99 |
# Start hermes-web-ui Node.js BFF server on :6060
|
| 100 |
+
echo "[$(date)] Starting hermes-web-ui BFF..."
|
| 101 |
export PORT=6060
|
| 102 |
export UPSTREAM=http://127.0.0.1:8642
|
| 103 |
export HERMES_HOME=/root/.hermes
|
|
|
|
| 107 |
cd /app/webui-server
|
| 108 |
node index.js >> /data/hermes/logs/webui.log 2>&1 &
|
| 109 |
WEBUI_PID=$!
|
| 110 |
+
echo "[$(date)] WebUI BFF PID: $WEBUI_PID"
|
| 111 |
|
| 112 |
# Wait for WebUI BFF to be ready
|
| 113 |
+
echo "[$(date)] Waiting for WebUI BFF to start..."
|
| 114 |
for i in $(seq 1 30); do
|
| 115 |
if curl -s http://127.0.0.1:6060/health > /dev/null 2>&1; then
|
| 116 |
+
echo "[$(date)] WebUI BFF is ready on :6060"
|
| 117 |
break
|
| 118 |
fi
|
| 119 |
sleep 2
|
|
|
|
| 127 |
echo " Auth Token: $AUTH_TOKEN"
|
| 128 |
echo ""
|
| 129 |
|
| 130 |
+
# ── Start Gateway watchdog in background ──
# Runs in a background subshell. Every 30s it probes the gateway health
# endpoint and, when the probe fails, stops the recorded gateway process and
# launches a fresh one.
(
  echo "[$(date)] Gateway watchdog started (check interval: 30s)"
  while true; do
    sleep 30
    # Check if gateway health endpoint responds
    if curl -s --max-time 5 http://127.0.0.1:8642/health > /dev/null 2>&1; then
      continue
    fi
    echo "[$(date)] Gateway health check FAILED — restarting..."

    # Stop the old process if any: SIGTERM first so it can release its own
    # pid/lock files, SIGKILL only after a short bounded grace period.
    if [ -f /tmp/hermes-gateway.pid ]; then
      OLD_PID=$(cat /tmp/hermes-gateway.pid)
      kill "$OLD_PID" 2>/dev/null || true
      grace=0
      while kill -0 "$OLD_PID" 2>/dev/null && [ "$grace" -lt 5 ]; do
        sleep 1
        grace=$((grace + 1))
      done
      kill -9 "$OLD_PID" 2>/dev/null || true
    fi

    # Append diagnostic info to log
    echo "--- Gateway watchdog restart $(date) ---" >> /data/hermes/logs/gateway.log

    # Restart. start_gateway removes the stale pid/lock files itself, so they
    # are not deleted a second time here.
    start_gateway

    # Give the new process the full startup budget. A fixed short sleep made
    # the next cycle kill gateways that were still booting.
    if wait_for_gateway; then
      echo "[$(date)] Gateway restarted successfully"
    else
      echo "[$(date)] Gateway restart failed — will retry in 30s"
    fi
  done
) &
WATCHDOG_PID=$!
echo "[$(date)] Watchdog PID: $WATCHDOG_PID"
|
| 163 |
+
|
| 164 |
# Start Python proxy on :7860 (main HF Space port)
|
| 165 |
exec python3 /app/entry.py
|