Z User committed on
Commit
300ab73
·
1 Parent(s): 11dc522

fix: add gateway watchdog + stale PID cleanup + retry logic

Browse files

- Clean stale gateway.pid/lock files on startup (prevents 'already running' false positive)
- Retry gateway start up to 3 times with diagnostic log output on failure
- Extend startup timeout from 60s to 90s
- Add background watchdog that checks gateway health every 30s and auto-restarts the gateway if the check fails
- Watchdog cleans stale state before each restart attempt

Files changed (1) hide show
  1. start.sh +98 -16
start.sh CHANGED
@@ -20,6 +20,13 @@ done
20
 
21
  echo "Persistent storage ready."
22
 
 
 
 
 
 
 
 
23
  # Initialize MemPalace if not already
24
  PALACE_PATH="${MEMPALACE_PALACE_PATH:-/data/hermes/palace}"
25
  if [ ! -f "$PALACE_PATH/.palace_initialized" ]; then
@@ -31,25 +38,66 @@ else
31
  echo "MemPalace already initialized."
32
  fi
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # Start Hermes Gateway (aiohttp API server on :8642 + Feishu platform)
35
- echo "Starting Hermes Gateway..."
36
- PYTHONUNBUFFERED=1 HERMES_ACCEPT_HOOKS=1 python3 -u -m hermes_cli.main gateway run -v \
37
- >> /data/hermes/logs/gateway.log 2>&1 &
38
- GATEWAY_PID=$!
39
- echo "Gateway PID: $GATEWAY_PID"
40
-
41
- # Wait for gateway to be ready
42
- echo "Waiting for Gateway to start..."
43
- for i in $(seq 1 30); do
44
- if curl -s http://127.0.0.1:8642/health > /dev/null 2>&1; then
45
- echo "Gateway is ready on :8642"
46
  break
47
  fi
48
- sleep 2
 
 
 
 
49
  done
50
 
 
 
 
 
51
  # Start hermes-web-ui Node.js BFF server on :6060
52
- echo "Starting hermes-web-ui BFF..."
53
  export PORT=6060
54
  export UPSTREAM=http://127.0.0.1:8642
55
  export HERMES_HOME=/root/.hermes
@@ -59,13 +107,13 @@ export NODE_ENV=production
59
  cd /app/webui-server
60
  node index.js >> /data/hermes/logs/webui.log 2>&1 &
61
  WEBUI_PID=$!
62
- echo "WebUI BFF PID: $WEBUI_PID"
63
 
64
  # Wait for WebUI BFF to be ready
65
- echo "Waiting for WebUI BFF to start..."
66
  for i in $(seq 1 30); do
67
  if curl -s http://127.0.0.1:6060/health > /dev/null 2>&1; then
68
- echo "WebUI BFF is ready on :6060"
69
  break
70
  fi
71
  sleep 2
@@ -79,5 +127,39 @@ echo " Proxy: http://0.0.0.0:7860"
79
  echo " Auth Token: $AUTH_TOKEN"
80
  echo ""
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  # Start Python proxy on :7860 (main HF Space port)
83
  exec python3 /app/entry.py
 
20
 
21
  echo "Persistent storage ready."
22
 
23
+ # ── Clean up stale PID/lock files from previous crash ──
24
+ echo "Cleaning up stale state..."
25
+ rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
26
+ rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
27
+ rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
28
+ echo "Stale state cleaned."
29
+
30
  # Initialize MemPalace if not already
31
  PALACE_PATH="${MEMPALACE_PALACE_PATH:-/data/hermes/palace}"
32
  if [ ! -f "$PALACE_PATH/.palace_initialized" ]; then
 
38
  echo "MemPalace already initialized."
39
  fi
40
 
41
+ # ── Gateway management functions ──
42
+ start_gateway() {
43
+ # Clean stale PID/lock before each start attempt
44
+ rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
45
+ rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
46
+ rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
47
+
48
+ echo "[$(date)] Starting Hermes Gateway..."
49
+ PYTHONUNBUFFERED=1 HERMES_ACCEPT_HOOKS=1 python3 -u -m hermes_cli.main gateway run -v \
50
+ >> /data/hermes/logs/gateway.log 2>&1 &
51
+ echo "$!" > /tmp/hermes-gateway.pid
52
+ echo "[$(date)] Gateway PID: $(cat /tmp/hermes-gateway.pid)"
53
+ }
54
+
55
+ wait_for_gateway() {
56
+ echo "[$(date)] Waiting for Gateway to start..."
57
+ for i in $(seq 1 45); do
58
+ if curl -s http://127.0.0.1:8642/health > /dev/null 2>&1; then
59
+ echo "[$(date)] Gateway is ready on :8642"
60
+ return 0
61
+ fi
62
+ # Check if process is still alive
63
+ if [ -f /tmp/hermes-gateway.pid ]; then
64
+ if ! kill -0 "$(cat /tmp/hermes-gateway.pid)" 2>/dev/null; then
65
+ echo "[$(date)] Gateway process died during startup (attempt $i/45)"
66
+ echo "[$(date)] --- Last 80 lines of gateway.log ---"
67
+ tail -80 /data/hermes/logs/gateway.log
68
+ echo "[$(date)] --- End gateway.log ---"
69
+ return 1
70
+ fi
71
+ fi
72
+ sleep 2
73
+ done
74
+ echo "[$(date)] Gateway startup timed out after 90s"
75
+ return 1
76
+ }
77
+
78
  # Start Hermes Gateway (aiohttp API server on :8642 + Feishu platform)
79
+ start_gateway
80
+
81
+ # Wait for gateway to be ready, with retries
82
+ GATEWAY_OK=false
83
+ for attempt in 1 2 3; do
84
+ if wait_for_gateway; then
85
+ GATEWAY_OK=true
 
 
 
 
86
  break
87
  fi
88
+ if [ $attempt -lt 3 ]; then
89
+ echo "[$(date)] Retrying gateway start (attempt $((attempt+1))/3)..."
90
+ sleep 5
91
+ start_gateway
92
+ fi
93
  done
94
 
95
+ if [ "$GATEWAY_OK" = false ]; then
96
+ echo "[$(date)] WARNING: Gateway failed after 3 attempts. Watchdog will keep retrying."
97
+ fi
98
+
99
  # Start hermes-web-ui Node.js BFF server on :6060
100
+ echo "[$(date)] Starting hermes-web-ui BFF..."
101
  export PORT=6060
102
  export UPSTREAM=http://127.0.0.1:8642
103
  export HERMES_HOME=/root/.hermes
 
107
  cd /app/webui-server
108
  node index.js >> /data/hermes/logs/webui.log 2>&1 &
109
  WEBUI_PID=$!
110
+ echo "[$(date)] WebUI BFF PID: $WEBUI_PID"
111
 
112
  # Wait for WebUI BFF to be ready
113
+ echo "[$(date)] Waiting for WebUI BFF to start..."
114
  for i in $(seq 1 30); do
115
  if curl -s http://127.0.0.1:6060/health > /dev/null 2>&1; then
116
+ echo "[$(date)] WebUI BFF is ready on :6060"
117
  break
118
  fi
119
  sleep 2
 
127
  echo " Auth Token: $AUTH_TOKEN"
128
  echo ""
129
 
130
+ # ── Start Gateway watchdog in background ──
131
+ (
132
+ echo "[$(date)] Gateway watchdog started (check interval: 30s)"
133
+ while true; do
134
+ sleep 30
135
+ # Check if gateway health endpoint responds
136
+ if ! curl -s --max-time 5 http://127.0.0.1:8642/health > /dev/null 2>&1; then
137
+ echo "[$(date)] Gateway health check FAILED — restarting..."
138
+ # Kill stale process if any
139
+ if [ -f /tmp/hermes-gateway.pid ]; then
140
+ OLD_PID=$(cat /tmp/hermes-gateway.pid)
141
+ kill -9 "$OLD_PID" 2>/dev/null || true
142
+ fi
143
+ # Append diagnostic info to log
144
+ echo "--- Gateway watchdog restart $(date) ---" >> /data/hermes/logs/gateway.log
145
+ # Clean stale state
146
+ rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
147
+ rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
148
+ rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
149
+ # Restart
150
+ start_gateway
151
+ # Brief wait then verify
152
+ sleep 15
153
+ if curl -s --max-time 5 http://127.0.0.1:8642/health > /dev/null 2>&1; then
154
+ echo "[$(date)] Gateway restarted successfully"
155
+ else
156
+ echo "[$(date)] Gateway restart failed — will retry in 30s"
157
+ fi
158
+ fi
159
+ done
160
+ ) &
161
+ WATCHDOG_PID=$!
162
+ echo "[$(date)] Watchdog PID: $WATCHDOG_PID"
163
+
164
  # Start Python proxy on :7860 (main HF Space port)
165
  exec python3 /app/entry.py