Z User committed on
Commit
7cae251
·
1 Parent(s): 6414e6b

fix: bind api_server to 127.0.0.1 + fix watchdog process detection

Browse files

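Root cause: api_server refused to bind to 0.0.0.0 without API_SERVER_KEY,
so the /health endpoint on :8642 was never available, even though the
Gateway process WAS running and connected to the Feishu WebSocket. The
watchdog's HTTP health check therefore always failed, and it kept killing
and restarting a healthy gateway.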

Fix 1: Change api_server host from 0.0.0.0 to 127.0.0.1 (only local
access is needed, since the entry.py proxy handles external requests)

Fix 2: Watchdog now checks process liveness (kill -0 PID) instead of an
HTTP health check, preventing false restarts of a running gateway

Files changed (2) hide show
  1. config.yaml +1 -1
  2. start.sh +21 -14
config.yaml CHANGED
@@ -13,7 +13,7 @@ platforms:
13
  api_server:
14
  enabled: true
15
  extra:
16
- host: 0.0.0.0
17
  port: 8642
18
  cors_origins: "*"
19
  memory:
 
13
  api_server:
14
  enabled: true
15
  extra:
16
+ host: 127.0.0.1
17
  port: 8642
18
  cors_origins: "*"
19
  memory:
start.sh CHANGED
@@ -129,31 +129,38 @@ echo ""
129
 
130
  # ── Start Gateway watchdog in background ──
131
  (
132
- echo "[$(date)] Gateway watchdog started (check interval: 30s)"
133
  while true; do
134
- sleep 30
135
- # Check if gateway health endpoint responds
136
- if ! curl -s --max-time 5 http://127.0.0.1:8642/health > /dev/null 2>&1; then
137
- echo "[$(date)] Gateway health check FAILED — restarting..."
138
- # Kill stale process if any
139
- if [ -f /tmp/hermes-gateway.pid ]; then
140
- OLD_PID=$(cat /tmp/hermes-gateway.pid)
141
- kill -9 "$OLD_PID" 2>/dev/null || true
 
 
 
 
 
 
142
  fi
143
- # Append diagnostic info to log
144
- echo "--- Gateway watchdog restart $(date) ---" >> /data/hermes/logs/gateway.log
 
 
145
  # Clean stale state
146
  rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
147
  rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
148
  rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
149
  # Restart
150
  start_gateway
151
- # Brief wait then verify
152
  sleep 15
153
- if curl -s --max-time 5 http://127.0.0.1:8642/health > /dev/null 2>&1; then
154
  echo "[$(date)] Gateway restarted successfully"
155
  else
156
- echo "[$(date)] Gateway restart failed — will retry in 30s"
157
  fi
158
  fi
159
  done
 
129
 
130
  # ── Start Gateway watchdog in background ──
131
  (
132
+ echo "[$(date)] Gateway watchdog started (check interval: 60s)"
133
  while true; do
134
+ sleep 60
135
+ # Check if gateway process is still alive
136
+ GW_ALIVE=false
137
+ if [ -f /tmp/hermes-gateway.pid ]; then
138
+ if kill -0 "$(cat /tmp/hermes-gateway.pid)" 2>/dev/null; then
139
+ GW_ALIVE=true
140
+ fi
141
+ fi
142
+ # Also check via gateway.pid in hermes home
143
+ if [ "$GW_ALIVE" = false ] && [ -f "$HERMES_HOME/gateway.pid" ]; then
144
+ if kill -0 "$(cat "$HERMES_HOME/gateway.pid")" 2>/dev/null; then
145
+ GW_ALIVE=true
146
+ # Sync the PID file
147
+ cp "$HERMES_HOME/gateway.pid" /tmp/hermes-gateway.pid
148
  fi
149
+ fi
150
+
151
+ if [ "$GW_ALIVE" = false ]; then
152
+ echo "[$(date)] Gateway process is DEAD — restarting..."
153
  # Clean stale state
154
  rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
155
  rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
156
  rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
157
  # Restart
158
  start_gateway
 
159
  sleep 15
160
+ if [ -f /tmp/hermes-gateway.pid ] && kill -0 "$(cat /tmp/hermes-gateway.pid)" 2>/dev/null; then
161
  echo "[$(date)] Gateway restarted successfully"
162
  else
163
+ echo "[$(date)] Gateway restart failed — will retry in 60s"
164
  fi
165
  fi
166
  done