Z User committed on
Commit
189ae90
·
1 Parent(s): 7cae251

fix: add Python-based gateway watchdog inside entry.py

Browse files

Root cause: Gateway received SIGTERM twice in 30s. The shell-level
watchdog in start.sh also died because 'set -e' caused cascading
process termination. Gateway was left orphaned with no restart.

Fix 1: Python watchdog thread inside entry.py (daemon thread)
- Monitors gateway process every 30s using psutil
- Auto-restarts with --replace flag if gateway dies
- Uses start_new_session=True to decouple from entry.py signals
- Exponential backoff (30s → 60s → 120s → ... max 5min)
- Falls back to process name search if PID files are missing
- Survives shell-level SIGTERM because it's inside entry.py

Fix 2: Remove 'set -e' from start.sh
- Gateway restarts should NOT kill the entire script
- Shell-level startup is best-effort only
- Python watchdog in entry.py handles all restart logic

Files changed (2) hide show
  1. entry.py +123 -0
  2. start.sh +27 -97
entry.py CHANGED
@@ -624,6 +624,124 @@ class ProxyHandler(BaseHTTPRequestHandler):
624
  return []
625
 
626
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
627
  # ---------------------------------------------------------------------------
628
  # Main
629
  # ---------------------------------------------------------------------------
@@ -642,6 +760,11 @@ def main():
642
  tailer.start()
643
  logger.info("Log tailer started")
644
 
 
 
 
 
 
645
  # Check if backend services are reachable
646
  for attempt in range(10):
647
  try:
 
624
  return []
625
 
626
 
627
+ # ---------------------------------------------------------------------------
628
+ # Python-based Gateway Watchdog (robust, lives inside entry.py)
629
+ # ---------------------------------------------------------------------------
630
+
631
def _gateway_watchdog(interval: int = 30):
    """Monitor gateway process liveness and restart the gateway if it dies.

    Loops forever and never returns; it is meant to be the target of a
    daemon thread. Every ``interval`` seconds it looks for a live gateway
    (first via the PID files, then by scanning process command lines). If
    none is found it spawns a new one with ``--replace``, subject to an
    exponential backoff (30s doubling up to 300s) that is reset once a
    gateway is seen alive again.

    Args:
        interval: Seconds to sleep between liveness checks.
    """
    pid_file = os.path.join(HERMES_HOME, "gateway.pid")
    alt_pid_file = "/tmp/hermes-gateway.pid"
    lock_file = os.path.join(HERMES_HOME, ".gateway_runtime_lock")
    takeover_file = os.path.join(HERMES_HOME, ".gateway_takeover")
    gw_log = "/data/hermes/logs/gateway.log"
    initial_backoff = 30
    max_backoff = 300  # 5 minutes

    def _find_gateway_pid() -> int | None:
        """Return the PID of a live gateway, or None if none is found."""
        for pf in (pid_file, alt_pid_file):
            try:
                with open(pf) as f:
                    pid = int(f.read().strip())
                if pid <= 0 or not psutil.pid_exists(pid):
                    continue
                # Guard against PID reuse: the process must look like ours.
                cmdline = " ".join(psutil.Process(pid).cmdline()).lower()
            except (OSError, ValueError, psutil.Error):
                # Missing/garbled PID file, or the process vanished or is
                # inaccessible between the existence check and the read.
                continue
            if "hermes" in cmdline or "gateway" in cmdline:
                return pid

        # Fallback: search by command line when the PID files are unusable.
        try:
            for proc in psutil.process_iter(["pid", "cmdline"]):
                try:
                    cmdline = " ".join(proc.info["cmdline"] or [])
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    continue
                if "hermes_cli.main" in cmdline and "gateway" in cmdline:
                    return proc.info["pid"]
        except Exception as e:
            # Best-effort scan: a failure here must not kill the watchdog.
            logger.debug("[Watchdog] Process scan failed: %s", e)
        return None

    def _start_gateway():
        """Spawn a new gateway process and return its ``Popen`` handle."""
        # Clean stale state left behind by the previous gateway instance.
        for stale in (pid_file, lock_file, takeover_file):
            try:
                os.remove(stale)
            except OSError:
                pass

        try:
            log_fh = open(gw_log, "a")
        except OSError:
            log_fh = None  # fall back to discarding gateway output

        try:
            if log_fh is not None:
                try:
                    log_fh.write(f"\n--- [Python Watchdog] Starting gateway at {datetime.now().isoformat()} ---\n")
                    log_fh.flush()
                except OSError:
                    pass

            env = os.environ.copy()
            env["PYTHONUNBUFFERED"] = "1"
            env["HERMES_ACCEPT_HOOKS"] = "1"
            proc = subprocess.Popen(
                [sys.executable, "-u", "-m", "hermes_cli.main", "gateway", "run", "-v", "--replace"],
                stdout=log_fh if log_fh is not None else subprocess.DEVNULL,
                stderr=subprocess.STDOUT,
                env=env,
                start_new_session=True,  # own session: decoupled from entry.py signals
            )
        finally:
            # The child holds its own duplicate of the descriptor; closing
            # the parent's copy avoids leaking one fd per restart.
            if log_fh is not None:
                log_fh.close()

        try:
            with open(alt_pid_file, "w") as f:
                f.write(str(proc.pid))
        except OSError:
            pass
        return proc

    logger.info("[Watchdog] Python gateway watchdog started (interval=%ds)", interval)
    time.sleep(10)  # Let initial startup settle

    child = None  # last gateway spawned by this watchdog, kept so it can be reaped
    last_restart = float("-inf")  # monotonic timestamp of the last restart attempt
    restart_backoff = initial_backoff

    while True:
        try:
            time.sleep(interval)

            # Reap our previous child if it has exited so it does not
            # linger as a zombie.
            if child is not None and child.poll() is not None:
                child = None

            if _find_gateway_pid() is not None:
                # Gateway is alive, reset backoff
                restart_backoff = initial_backoff
                continue

            # Gateway is dead — restart with backoff. A monotonic clock keeps
            # the backoff immune to wall-clock adjustments.
            now = time.monotonic()
            if now - last_restart < restart_backoff:
                continue

            logger.warning("[Watchdog] Gateway process is DEAD — restarting...")
            # Record the attempt before spawning so that a failing spawn is
            # also subject to the backoff.
            last_restart = now
            restart_backoff = min(restart_backoff * 2, max_backoff)
            child = _start_gateway()

            # Wait and verify
            time.sleep(15)
            if child.poll() is None:
                logger.info("[Watchdog] Gateway restarted successfully (PID=%d)", child.pid)
                restart_backoff = initial_backoff  # reset on success
            else:
                logger.error("[Watchdog] Gateway exited immediately with code %d", child.returncode)
                child = None

        except Exception as e:
            # Top-level boundary: the watchdog thread must never die.
            logger.error("[Watchdog] Error: %s", e)
            time.sleep(10)
743
+
744
+
745
  # ---------------------------------------------------------------------------
746
  # Main
747
  # ---------------------------------------------------------------------------
 
760
  tailer.start()
761
  logger.info("Log tailer started")
762
 
763
+ # Start Python-based gateway watchdog (survives shell death)
764
+ watchdog = threading.Thread(target=_gateway_watchdog, args=(30,), daemon=True, name="gateway-watchdog")
765
+ watchdog.start()
766
+ logger.info("Python gateway watchdog started")
767
+
768
  # Check if backend services are reachable
769
  for attempt in range(10):
770
  try:
start.sh CHANGED
@@ -1,5 +1,6 @@
1
  #!/bin/bash
2
- set -e
 
3
 
4
  echo "=== Hermes Bot — HuggingFace Space Startup ==="
5
 
@@ -25,6 +26,7 @@ echo "Cleaning up stale state..."
25
  rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
26
  rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
27
  rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
 
28
  echo "Stale state cleaned."
29
 
30
  # Initialize MemPalace if not already
@@ -38,65 +40,32 @@ else
38
  echo "MemPalace already initialized."
39
  fi
40
 
41
- # ── Gateway management functions ──
42
- start_gateway() {
43
- # Clean stale PID/lock before each start attempt
44
- rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
45
- rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
46
- rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
 
47
 
48
- echo "[$(date)] Starting Hermes Gateway..."
49
- PYTHONUNBUFFERED=1 HERMES_ACCEPT_HOOKS=1 python3 -u -m hermes_cli.main gateway run -v \
50
- >> /data/hermes/logs/gateway.log 2>&1 &
51
- echo "$!" > /tmp/hermes-gateway.pid
52
- echo "[$(date)] Gateway PID: $(cat /tmp/hermes-gateway.pid)"
53
- }
54
-
55
- wait_for_gateway() {
56
- echo "[$(date)] Waiting for Gateway to start..."
57
- for i in $(seq 1 45); do
58
- if curl -s http://127.0.0.1:8642/health > /dev/null 2>&1; then
59
- echo "[$(date)] Gateway is ready on :8642"
60
- return 0
61
- fi
62
- # Check if process is still alive
63
- if [ -f /tmp/hermes-gateway.pid ]; then
64
- if ! kill -0 "$(cat /tmp/hermes-gateway.pid)" 2>/dev/null; then
65
- echo "[$(date)] Gateway process died during startup (attempt $i/45)"
66
- echo "[$(date)] --- Last 80 lines of gateway.log ---"
67
- tail -80 /data/hermes/logs/gateway.log
68
- echo "[$(date)] --- End gateway.log ---"
69
- return 1
70
- fi
71
- fi
72
- sleep 2
73
- done
74
- echo "[$(date)] Gateway startup timed out after 90s"
75
- return 1
76
- }
77
-
78
- # Start Hermes Gateway (aiohttp API server on :8642 + Feishu platform)
79
- start_gateway
80
-
81
- # Wait for gateway to be ready, with retries
82
- GATEWAY_OK=false
83
- for attempt in 1 2 3; do
84
- if wait_for_gateway; then
85
- GATEWAY_OK=true
86
  break
87
  fi
88
- if [ $attempt -lt 3 ]; then
89
- echo "[$(date)] Retrying gateway start (attempt $((attempt+1))/3)..."
90
- sleep 5
91
- start_gateway
 
 
92
  fi
 
93
  done
94
 
95
- if [ "$GATEWAY_OK" = false ]; then
96
- echo "[$(date)] WARNING: Gateway failed after 3 attempts. Watchdog will keep retrying."
97
- fi
98
-
99
- # Start hermes-web-ui Node.js BFF server on :6060
100
  echo "[$(date)] Starting hermes-web-ui BFF..."
101
  export PORT=6060
102
  export UPSTREAM=http://127.0.0.1:8642
@@ -111,7 +80,7 @@ echo "[$(date)] WebUI BFF PID: $WEBUI_PID"
111
 
112
  # Wait for WebUI BFF to be ready
113
  echo "[$(date)] Waiting for WebUI BFF to start..."
114
- for i in $(seq 1 30); do
115
  if curl -s http://127.0.0.1:6060/health > /dev/null 2>&1; then
116
  echo "[$(date)] WebUI BFF is ready on :6060"
117
  break
@@ -121,52 +90,13 @@ done
121
 
122
  echo ""
123
  echo "=== All services started ==="
124
- echo " Gateway: http://127.0.0.1:8642"
125
  echo " WebUI: http://127.0.0.1:6060"
126
  echo " Proxy: http://0.0.0.0:7860"
127
  echo " Auth Token: $AUTH_TOKEN"
128
  echo ""
129
 
130
- # ── Start Gateway watchdog in background ──
131
- (
132
- echo "[$(date)] Gateway watchdog started (check interval: 60s)"
133
- while true; do
134
- sleep 60
135
- # Check if gateway process is still alive
136
- GW_ALIVE=false
137
- if [ -f /tmp/hermes-gateway.pid ]; then
138
- if kill -0 "$(cat /tmp/hermes-gateway.pid)" 2>/dev/null; then
139
- GW_ALIVE=true
140
- fi
141
- fi
142
- # Also check via gateway.pid in hermes home
143
- if [ "$GW_ALIVE" = false ] && [ -f "$HERMES_HOME/gateway.pid" ]; then
144
- if kill -0 "$(cat "$HERMES_HOME/gateway.pid")" 2>/dev/null; then
145
- GW_ALIVE=true
146
- # Sync the PID file
147
- cp "$HERMES_HOME/gateway.pid" /tmp/hermes-gateway.pid
148
- fi
149
- fi
150
-
151
- if [ "$GW_ALIVE" = false ]; then
152
- echo "[$(date)] Gateway process is DEAD — restarting..."
153
- # Clean stale state
154
- rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
155
- rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
156
- rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
157
- # Restart
158
- start_gateway
159
- sleep 15
160
- if [ -f /tmp/hermes-gateway.pid ] && kill -0 "$(cat /tmp/hermes-gateway.pid)" 2>/dev/null; then
161
- echo "[$(date)] Gateway restarted successfully"
162
- else
163
- echo "[$(date)] Gateway restart failed — will retry in 60s"
164
- fi
165
- fi
166
- done
167
- ) &
168
- WATCHDOG_PID=$!
169
- echo "[$(date)] Watchdog PID: $WATCHDOG_PID"
170
-
171
  # Start Python proxy on :7860 (main HF Space port)
 
 
172
  exec python3 /app/entry.py
 
1
  #!/bin/bash
2
+ # Hermes Bot — HuggingFace Space Startup
3
+ # NOTE: No 'set -e' — gateway restarts should not kill the entire script
4
 
5
  echo "=== Hermes Bot — HuggingFace Space Startup ==="
6
 
 
26
  rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
27
  rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
28
  rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
29
+ rm -f /tmp/hermes-gateway.pid 2>/dev/null
30
  echo "Stale state cleaned."
31
 
32
  # Initialize MemPalace if not already
 
40
  echo "MemPalace already initialized."
41
  fi
42
 
43
+ # ── Start Hermes Gateway ──
44
+ echo "[$(date)] Starting Hermes Gateway..."
45
+ PYTHONUNBUFFERED=1 HERMES_ACCEPT_HOOKS=1 python3 -u -m hermes_cli.main gateway run -v \
46
+ >> /data/hermes/logs/gateway.log 2>&1 &
47
+ GATEWAY_PID=$!
48
+ echo "[$(date)] Gateway PID: $GATEWAY_PID"
49
+ echo "$GATEWAY_PID" > /tmp/hermes-gateway.pid
50
 
51
+ # Wait for gateway to be ready (best-effort, no set -e)
52
+ echo "[$(date)] Waiting for Gateway to start..."
53
+ for i in $(seq 1 30); do
54
+ if curl -s http://127.0.0.1:8642/health > /dev/null 2>&1; then
55
+ echo "[$(date)] Gateway is ready on :8642"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  break
57
  fi
58
+ # Check if process is still alive
59
+ if ! kill -0 "$GATEWAY_PID" 2>/dev/null; then
60
+ echo "[$(date)] Gateway process died during startup"
61
+ tail -30 /data/hermes/logs/gateway.log 2>/dev/null
62
+ echo "[$(date)] NOTE: Python watchdog in entry.py will auto-restart gateway"
63
+ break
64
  fi
65
+ sleep 2
66
  done
67
 
68
+ # ── Start hermes-web-ui Node.js BFF server on :6060 ──
 
 
 
 
69
  echo "[$(date)] Starting hermes-web-ui BFF..."
70
  export PORT=6060
71
  export UPSTREAM=http://127.0.0.1:8642
 
80
 
81
  # Wait for WebUI BFF to be ready
82
  echo "[$(date)] Waiting for WebUI BFF to start..."
83
+ for i in $(seq 1 15); do
84
  if curl -s http://127.0.0.1:6060/health > /dev/null 2>&1; then
85
  echo "[$(date)] WebUI BFF is ready on :6060"
86
  break
 
90
 
91
  echo ""
92
  echo "=== All services started ==="
93
+ echo " Gateway: http://127.0.0.1:8642 (with Python watchdog in entry.py)"
94
  echo " WebUI: http://127.0.0.1:6060"
95
  echo " Proxy: http://0.0.0.0:7860"
96
  echo " Auth Token: $AUTH_TOKEN"
97
  echo ""
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  # Start Python proxy on :7860 (main HF Space port)
100
+ # entry.py contains a Python-based gateway watchdog that will auto-restart
101
+ # the gateway if it dies, regardless of what happens to this shell script
102
  exec python3 /app/entry.py