Spaces:
Running
fix: add Python-based gateway watchdog inside entry.py
Browse files
Root cause: The gateway received SIGTERM twice within 30s. The shell-level
watchdog in start.sh also died because 'set -e' caused cascading
process termination. Gateway was left orphaned with no restart.
Fix 1: Python watchdog thread inside entry.py (daemon thread)
- Monitors gateway process every 30s using psutil
- Auto-restarts with --replace flag if gateway dies
- Uses start_new_session=True to decouple from entry.py signals
- Exponential backoff (30s → 60s → 120s → ... max 5min)
- Falls back to process name search if PID files are missing
- Survives shell-level SIGTERM because it's inside entry.py
Fix 2: Remove 'set -e' from start.sh
- Gateway restarts should NOT kill the entire script
- Shell-level startup is best-effort only
- Python watchdog in entry.py handles all restart logic
|
@@ -624,6 +624,124 @@ class ProxyHandler(BaseHTTPRequestHandler):
|
|
| 624 |
return []
|
| 625 |
|
| 626 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 627 |
# ---------------------------------------------------------------------------
|
| 628 |
# Main
|
| 629 |
# ---------------------------------------------------------------------------
|
|
@@ -642,6 +760,11 @@ def main():
|
|
| 642 |
tailer.start()
|
| 643 |
logger.info("Log tailer started")
|
| 644 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
# Check if backend services are reachable
|
| 646 |
for attempt in range(10):
|
| 647 |
try:
|
|
|
|
| 624 |
return []
|
| 625 |
|
| 626 |
|
| 627 |
+
# ---------------------------------------------------------------------------
|
| 628 |
+
# Python-based Gateway Watchdog (robust, lives inside entry.py)
|
| 629 |
+
# ---------------------------------------------------------------------------
|
| 630 |
+
|
| 631 |
+
def _gateway_watchdog(interval: int = 30):
    """Monitor gateway process liveness and auto-restart it if it dies.

    Meant to be run as a daemon thread inside entry.py. The loop never
    returns: every ``interval`` seconds it looks for a live gateway process
    and, if none is found, spawns a new one in its own session. Restart
    attempts are rate-limited with an exponential backoff (30s doubling up
    to 300s) that is reset whenever the gateway is observed alive.

    Args:
        interval: Seconds to sleep between liveness checks.
    """
    pid_file = os.path.join(HERMES_HOME, "gateway.pid")
    alt_pid_file = "/tmp/hermes-gateway.pid"
    lock_file = os.path.join(HERMES_HOME, ".gateway_runtime_lock")
    takeover_file = os.path.join(HERMES_HOME, ".gateway_takeover")
    gw_log = "/data/hermes/logs/gateway.log"

    initial_backoff = 30
    max_backoff = 300  # 5 minutes

    def _find_gateway_pid() -> int | None:
        """Return the PID of a live gateway, or None if none is found.

        PID files are consulted first; if they are missing, stale or point
        at an unrelated process, fall back to scanning all processes for a
        ``hermes_cli.main ... gateway`` command line.
        """
        for pf in (pid_file, alt_pid_file):
            try:
                with open(pf) as f:
                    pid = int(f.read().strip())
                if pid > 0 and psutil.pid_exists(pid):
                    # Guard against PID reuse: only accept the PID when the
                    # command line looks like the gateway.
                    cmd = " ".join(psutil.Process(pid).cmdline()).lower()
                    if "hermes" in cmd or "gateway" in cmd:
                        return pid
            except FileNotFoundError:
                # A missing PID file is routine; try the next source.
                continue
            except Exception as exc:
                # Best-effort lookup: unreadable/garbled PID file or a
                # process that vanished mid-inspection is not fatal.
                logger.debug("[Watchdog] Ignoring PID file %s: %s", pf, exc)

        # Fallback: search by process command line
        try:
            for proc in psutil.process_iter(["pid", "cmdline"]):
                try:
                    cmdline = " ".join(proc.info["cmdline"] or [])
                    if "hermes_cli.main" in cmdline and "gateway" in cmdline:
                        return proc.info["pid"]
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    pass
        except Exception as exc:
            logger.debug("[Watchdog] Process scan failed: %s", exc)
        return None

    def _start_gateway():
        """Spawn a new gateway process and return its ``Popen`` handle.

        Removes stale PID/lock/takeover files first, redirects the child's
        output to the gateway log (or DEVNULL if the log cannot be opened),
        and records the new PID in the alternate PID file.
        """
        # Clean stale state
        for stale in (pid_file, lock_file, takeover_file):
            try:
                os.remove(stale)
            except OSError:
                pass

        log_fh = None
        try:
            log_fh = open(gw_log, "a")
            log_fh.write(f"\n--- [Python Watchdog] Starting gateway at {datetime.now().isoformat()} ---\n")
            log_fh.flush()
        except OSError as exc:
            logger.warning("[Watchdog] Could not write to gateway log %s: %s", gw_log, exc)

        env = os.environ.copy()
        env["PYTHONUNBUFFERED"] = "1"
        env["HERMES_ACCEPT_HOOKS"] = "1"
        try:
            proc = subprocess.Popen(
                [sys.executable, "-u", "-m", "hermes_cli.main", "gateway", "run", "-v", "--replace"],
                stdout=log_fh if log_fh else subprocess.DEVNULL,
                stderr=subprocess.STDOUT,
                env=env,
                start_new_session=True,  # decouple from entry.py signals
            )
        finally:
            # The child holds its own duplicate of the descriptor; close the
            # parent's copy so each restart does not leak an open file.
            if log_fh is not None:
                try:
                    log_fh.close()
                except OSError:
                    pass

        try:
            with open(alt_pid_file, "w") as f:
                f.write(str(proc.pid))
        except OSError as exc:
            logger.debug("[Watchdog] Could not write %s: %s", alt_pid_file, exc)
        return proc

    logger.info("[Watchdog] Python gateway watchdog started (interval=%ds)", interval)
    time.sleep(10)  # Let initial startup settle

    # Monotonic timestamp of the most recent restart attempt; None means no
    # restart has been attempted yet, so the first one is never delayed.
    last_restart = None
    restart_backoff = initial_backoff
    child = None  # last gateway spawned by this watchdog, if any

    while True:
        try:
            time.sleep(interval)

            # Poll a gateway we spawned earlier so that, once it has exited,
            # its exit status is collected instead of lingering unreaped.
            if child is not None and child.poll() is not None:
                child = None

            if _find_gateway_pid() is not None:
                # Gateway is alive, reset backoff
                restart_backoff = initial_backoff
                continue

            # Gateway is dead — restart with backoff. A monotonic clock is
            # used so wall-clock adjustments cannot stretch or skip the wait.
            now = time.monotonic()
            if last_restart is not None and now - last_restart < restart_backoff:
                continue

            logger.warning("[Watchdog] Gateway process is DEAD — restarting...")
            child = _start_gateway()
            last_restart = now
            restart_backoff = min(restart_backoff * 2, max_backoff)  # exponential backoff, max 5min

            # Wait and verify
            time.sleep(15)
            try:
                if child.poll() is None:
                    logger.info("[Watchdog] Gateway restarted successfully (PID=%d)", child.pid)
                    restart_backoff = initial_backoff  # reset on success
                else:
                    logger.error("[Watchdog] Gateway exited immediately with code %d", child.returncode)
            except Exception as e:
                logger.error("[Watchdog] Failed to verify restart: %s", e)

        except Exception as e:
            # Keep the watchdog thread alive no matter what a cycle raises.
            logger.error("[Watchdog] Error: %s", e)
            time.sleep(10)
|
| 743 |
+
|
| 744 |
+
|
| 745 |
# ---------------------------------------------------------------------------
|
| 746 |
# Main
|
| 747 |
# ---------------------------------------------------------------------------
|
|
|
|
| 760 |
tailer.start()
|
| 761 |
logger.info("Log tailer started")
|
| 762 |
|
| 763 |
+
# Start Python-based gateway watchdog (survives shell death)
|
| 764 |
+
watchdog = threading.Thread(target=_gateway_watchdog, args=(30,), daemon=True, name="gateway-watchdog")
|
| 765 |
+
watchdog.start()
|
| 766 |
+
logger.info("Python gateway watchdog started")
|
| 767 |
+
|
| 768 |
# Check if backend services are reachable
|
| 769 |
for attempt in range(10):
|
| 770 |
try:
|
|
@@ -1,5 +1,6 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
-
|
|
|
|
| 3 |
|
| 4 |
echo "=== Hermes Bot — HuggingFace Space Startup ==="
|
| 5 |
|
|
@@ -25,6 +26,7 @@ echo "Cleaning up stale state..."
|
|
| 25 |
rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
|
| 26 |
rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
|
| 27 |
rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
|
|
|
|
| 28 |
echo "Stale state cleaned."
|
| 29 |
|
| 30 |
# Initialize MemPalace if not already
|
|
@@ -38,65 +40,32 @@ else
|
|
| 38 |
echo "MemPalace already initialized."
|
| 39 |
fi
|
| 40 |
|
| 41 |
-
# ──
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
}
|
| 54 |
-
|
| 55 |
-
wait_for_gateway() {
|
| 56 |
-
echo "[$(date)] Waiting for Gateway to start..."
|
| 57 |
-
for i in $(seq 1 45); do
|
| 58 |
-
if curl -s http://127.0.0.1:8642/health > /dev/null 2>&1; then
|
| 59 |
-
echo "[$(date)] Gateway is ready on :8642"
|
| 60 |
-
return 0
|
| 61 |
-
fi
|
| 62 |
-
# Check if process is still alive
|
| 63 |
-
if [ -f /tmp/hermes-gateway.pid ]; then
|
| 64 |
-
if ! kill -0 "$(cat /tmp/hermes-gateway.pid)" 2>/dev/null; then
|
| 65 |
-
echo "[$(date)] Gateway process died during startup (attempt $i/45)"
|
| 66 |
-
echo "[$(date)] --- Last 80 lines of gateway.log ---"
|
| 67 |
-
tail -80 /data/hermes/logs/gateway.log
|
| 68 |
-
echo "[$(date)] --- End gateway.log ---"
|
| 69 |
-
return 1
|
| 70 |
-
fi
|
| 71 |
-
fi
|
| 72 |
-
sleep 2
|
| 73 |
-
done
|
| 74 |
-
echo "[$(date)] Gateway startup timed out after 90s"
|
| 75 |
-
return 1
|
| 76 |
-
}
|
| 77 |
-
|
| 78 |
-
# Start Hermes Gateway (aiohttp API server on :8642 + Feishu platform)
|
| 79 |
-
start_gateway
|
| 80 |
-
|
| 81 |
-
# Wait for gateway to be ready, with retries
|
| 82 |
-
GATEWAY_OK=false
|
| 83 |
-
for attempt in 1 2 3; do
|
| 84 |
-
if wait_for_gateway; then
|
| 85 |
-
GATEWAY_OK=true
|
| 86 |
break
|
| 87 |
fi
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
| 92 |
fi
|
|
|
|
| 93 |
done
|
| 94 |
|
| 95 |
-
|
| 96 |
-
echo "[$(date)] WARNING: Gateway failed after 3 attempts. Watchdog will keep retrying."
|
| 97 |
-
fi
|
| 98 |
-
|
| 99 |
-
# Start hermes-web-ui Node.js BFF server on :6060
|
| 100 |
echo "[$(date)] Starting hermes-web-ui BFF..."
|
| 101 |
export PORT=6060
|
| 102 |
export UPSTREAM=http://127.0.0.1:8642
|
|
@@ -111,7 +80,7 @@ echo "[$(date)] WebUI BFF PID: $WEBUI_PID"
|
|
| 111 |
|
| 112 |
# Wait for WebUI BFF to be ready
|
| 113 |
echo "[$(date)] Waiting for WebUI BFF to start..."
|
| 114 |
-
for i in $(seq 1
|
| 115 |
if curl -s http://127.0.0.1:6060/health > /dev/null 2>&1; then
|
| 116 |
echo "[$(date)] WebUI BFF is ready on :6060"
|
| 117 |
break
|
|
@@ -121,52 +90,13 @@ done
|
|
| 121 |
|
| 122 |
echo ""
|
| 123 |
echo "=== All services started ==="
|
| 124 |
-
echo " Gateway: http://127.0.0.1:8642"
|
| 125 |
echo " WebUI: http://127.0.0.1:6060"
|
| 126 |
echo " Proxy: http://0.0.0.0:7860"
|
| 127 |
echo " Auth Token: $AUTH_TOKEN"
|
| 128 |
echo ""
|
| 129 |
|
| 130 |
-
# ── Start Gateway watchdog in background ──
|
| 131 |
-
(
|
| 132 |
-
echo "[$(date)] Gateway watchdog started (check interval: 60s)"
|
| 133 |
-
while true; do
|
| 134 |
-
sleep 60
|
| 135 |
-
# Check if gateway process is still alive
|
| 136 |
-
GW_ALIVE=false
|
| 137 |
-
if [ -f /tmp/hermes-gateway.pid ]; then
|
| 138 |
-
if kill -0 "$(cat /tmp/hermes-gateway.pid)" 2>/dev/null; then
|
| 139 |
-
GW_ALIVE=true
|
| 140 |
-
fi
|
| 141 |
-
fi
|
| 142 |
-
# Also check via gateway.pid in hermes home
|
| 143 |
-
if [ "$GW_ALIVE" = false ] && [ -f "$HERMES_HOME/gateway.pid" ]; then
|
| 144 |
-
if kill -0 "$(cat "$HERMES_HOME/gateway.pid")" 2>/dev/null; then
|
| 145 |
-
GW_ALIVE=true
|
| 146 |
-
# Sync the PID file
|
| 147 |
-
cp "$HERMES_HOME/gateway.pid" /tmp/hermes-gateway.pid
|
| 148 |
-
fi
|
| 149 |
-
fi
|
| 150 |
-
|
| 151 |
-
if [ "$GW_ALIVE" = false ]; then
|
| 152 |
-
echo "[$(date)] Gateway process is DEAD — restarting..."
|
| 153 |
-
# Clean stale state
|
| 154 |
-
rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
|
| 155 |
-
rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
|
| 156 |
-
rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
|
| 157 |
-
# Restart
|
| 158 |
-
start_gateway
|
| 159 |
-
sleep 15
|
| 160 |
-
if [ -f /tmp/hermes-gateway.pid ] && kill -0 "$(cat /tmp/hermes-gateway.pid)" 2>/dev/null; then
|
| 161 |
-
echo "[$(date)] Gateway restarted successfully"
|
| 162 |
-
else
|
| 163 |
-
echo "[$(date)] Gateway restart failed — will retry in 60s"
|
| 164 |
-
fi
|
| 165 |
-
fi
|
| 166 |
-
done
|
| 167 |
-
) &
|
| 168 |
-
WATCHDOG_PID=$!
|
| 169 |
-
echo "[$(date)] Watchdog PID: $WATCHDOG_PID"
|
| 170 |
-
|
| 171 |
# Start Python proxy on :7860 (main HF Space port)
|
|
|
|
|
|
|
| 172 |
exec python3 /app/entry.py
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
+
# Hermes Bot — HuggingFace Space Startup
|
| 3 |
+
# NOTE: No 'set -e' — gateway restarts should not kill the entire script
|
| 4 |
|
| 5 |
echo "=== Hermes Bot — HuggingFace Space Startup ==="
|
| 6 |
|
|
|
|
| 26 |
rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
|
| 27 |
rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
|
| 28 |
rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
|
| 29 |
+
rm -f /tmp/hermes-gateway.pid 2>/dev/null
|
| 30 |
echo "Stale state cleaned."
|
| 31 |
|
| 32 |
# Initialize MemPalace if not already
|
|
|
|
| 40 |
echo "MemPalace already initialized."
|
| 41 |
fi
|
| 42 |
|
| 43 |
+
# ── Start Hermes Gateway ──
|
| 44 |
+
echo "[$(date)] Starting Hermes Gateway..."
|
| 45 |
+
PYTHONUNBUFFERED=1 HERMES_ACCEPT_HOOKS=1 python3 -u -m hermes_cli.main gateway run -v \
|
| 46 |
+
>> /data/hermes/logs/gateway.log 2>&1 &
|
| 47 |
+
GATEWAY_PID=$!
|
| 48 |
+
echo "[$(date)] Gateway PID: $GATEWAY_PID"
|
| 49 |
+
echo "$GATEWAY_PID" > /tmp/hermes-gateway.pid
|
| 50 |
|
| 51 |
+
# Wait for gateway to be ready (best-effort, no set -e)
|
| 52 |
+
echo "[$(date)] Waiting for Gateway to start..."
|
| 53 |
+
for i in $(seq 1 30); do
|
| 54 |
+
if curl -s http://127.0.0.1:8642/health > /dev/null 2>&1; then
|
| 55 |
+
echo "[$(date)] Gateway is ready on :8642"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
break
|
| 57 |
fi
|
| 58 |
+
# Check if process is still alive
|
| 59 |
+
if ! kill -0 "$GATEWAY_PID" 2>/dev/null; then
|
| 60 |
+
echo "[$(date)] Gateway process died during startup"
|
| 61 |
+
tail -30 /data/hermes/logs/gateway.log 2>/dev/null
|
| 62 |
+
echo "[$(date)] NOTE: Python watchdog in entry.py will auto-restart gateway"
|
| 63 |
+
break
|
| 64 |
fi
|
| 65 |
+
sleep 2
|
| 66 |
done
|
| 67 |
|
| 68 |
+
# ── Start hermes-web-ui Node.js BFF server on :6060 ──
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
echo "[$(date)] Starting hermes-web-ui BFF..."
|
| 70 |
export PORT=6060
|
| 71 |
export UPSTREAM=http://127.0.0.1:8642
|
|
|
|
| 80 |
|
| 81 |
# Wait for WebUI BFF to be ready
|
| 82 |
echo "[$(date)] Waiting for WebUI BFF to start..."
|
| 83 |
+
for i in $(seq 1 15); do
|
| 84 |
if curl -s http://127.0.0.1:6060/health > /dev/null 2>&1; then
|
| 85 |
echo "[$(date)] WebUI BFF is ready on :6060"
|
| 86 |
break
|
|
|
|
| 90 |
|
| 91 |
echo ""
|
| 92 |
echo "=== All services started ==="
|
| 93 |
+
echo " Gateway: http://127.0.0.1:8642 (with Python watchdog in entry.py)"
|
| 94 |
echo " WebUI: http://127.0.0.1:6060"
|
| 95 |
echo " Proxy: http://0.0.0.0:7860"
|
| 96 |
echo " Auth Token: $AUTH_TOKEN"
|
| 97 |
echo ""
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
# Start Python proxy on :7860 (main HF Space port)
|
| 100 |
+
# entry.py contains a Python-based gateway watchdog that will auto-restart
|
| 101 |
+
# the gateway if it dies, regardless of what happens to this shell script
|
| 102 |
exec python3 /app/entry.py
|