Spaces:
Running
fix: robust gateway restart — force-kill residual processes before restart
Browse files
ROOT CAUSE: When the gateway crashes and the watchdog restarts it, the old process
hasn't fully released the Feishu WebSocket lock or port 8642. The new gateway
then fails with:
- 'Another gateway is already using this Feishu app_id (PID xxx)'
- api_server falls back to port 8643 (wrong port)
- entry.py proxy still targets 8642 → complete disconnect
FIX (3 layers):
1. entry.py watchdog: new _kill_all_gateways() function that:
- SIGKILLs ALL hermes gateway processes (not just tracked PID)
- Kills any process holding ports 8642/8643
- Waits for ports to be released
- Cleans ALL lock files including Feishu locks
2. start.sh: force-kill residual gateway processes with kill -9 before
starting fresh, plus clean Feishu lock files
3. entry.py watchdog: increased initial settle time to 15s
|
@@ -666,15 +666,81 @@ def _gateway_watchdog(interval: int = 30):
|
|
| 666 |
pass
|
| 667 |
return None
|
| 668 |
|
| 669 |
-
def
|
| 670 |
-
"""
|
| 671 |
-
|
| 672 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 673 |
try:
|
| 674 |
os.remove(f)
|
| 675 |
except Exception:
|
| 676 |
pass
|
| 677 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 678 |
log_fh = None
|
| 679 |
try:
|
| 680 |
log_fh = open(gw_log, "a")
|
|
|
|
| 666 |
pass
|
| 667 |
return None
|
| 668 |
|
| 669 |
+
def _kill_all_gateways():
    """Force-kill ALL existing gateway processes and wait for ports to free.

    This prevents the 'another gateway is already using this Feishu app_id'
    error and port conflicts when restarting after a crash.

    Steps, all best-effort (no exception escapes this function for an
    individual process or file that cannot be handled):

    1. SIGKILL every process whose command line contains both
       ``hermes_cli.main`` and ``gateway``.
    2. SIGKILL any other process listening on port 8642 or 8643.
    3. Sleep so the killed processes can exit and release their ports.
    4. Remove the PID/lock/takeover state files and any Feishu lock files
       found directly under ``HERMES_HOME``.

    The calling process itself is never killed.

    Returns:
        int: Number of processes that were sent SIGKILL (command-line
        matches plus port holders).
    """
    import glob  # local import: only this cleanup path needs it

    own_pid = os.getpid()
    # pid -> psutil.Process for everything we killed. Keeping the Process
    # objects (instead of re-resolving PIDs later) means the liveness check
    # below cannot be fooled by the kernel recycling a PID.
    victims = {}

    # Find ALL hermes gateway processes (not just the one we track)
    try:
        for proc in psutil.process_iter(["pid", "cmdline"]):
            try:
                cmdline = " ".join(proc.info["cmdline"] or [])
                if "hermes_cli.main" not in cmdline or "gateway" not in cmdline:
                    continue
                pid = proc.info["pid"]
                if pid == own_pid:
                    continue
                logger.info("[Watchdog] Force-killing residual gateway PID=%d", pid)
                proc.kill()  # SIGKILL — no graceful shutdown, just die
                victims[pid] = proc
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                pass
    except Exception as e:
        logger.warning("[Watchdog] Error scanning processes: %s", e)

    # Also kill any lingering aiohttp servers on port 8642/8643.
    # The connection table is read once and filtered for both ports.
    try:
        listeners = [
            conn
            for conn in psutil.net_connections(kind='inet')
            if conn.status == 'LISTEN'
            and conn.laddr
            and conn.laddr.port in (8642, 8643)
        ]
    except Exception as e:
        logger.warning("[Watchdog] Error scanning listening ports: %s", e)
        listeners = []

    for conn in listeners:
        pid = conn.pid
        if pid is None:
            # psutil reports pid=None when the owner cannot be determined.
            # psutil.Process(None) would resolve to *this* process, so
            # killing it would take down the watchdog itself.
            logger.warning(
                "[Watchdog] Port %d is held by a process whose PID cannot be determined",
                conn.laddr.port,
            )
            continue
        if pid == own_pid or pid in victims:
            continue
        try:
            owner = psutil.Process(pid)
            logger.info("[Watchdog] Force-killing process %d on port %d", pid, conn.laddr.port)
            owner.kill()
            victims[pid] = owner
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    # Wait for processes to actually die and ports to be released
    if victims:
        time.sleep(3)
        for pid, proc in victims.items():
            try:
                if proc.is_running():
                    logger.warning("[Watchdog] PID %d still running after kill, waiting...", pid)
                    time.sleep(5)
            except psutil.Error:
                pass

    # Clean ALL stale state files
    for f in (pid_file, alt_pid_file, lock_file, takeover_file):
        try:
            os.remove(f)
        except FileNotFoundError:
            pass  # nothing to clean — the normal case
        except Exception as e:
            logger.warning("[Watchdog] Could not remove stale state file %s: %s", f, e)

    # Also clean any .feishu_lock or similar files
    for pattern in ("feishu*.lock", "*.feishu_lock"):
        for lock_f in glob.glob(os.path.join(HERMES_HOME, pattern)):
            try:
                os.remove(lock_f)
                logger.info("[Watchdog] Removed stale lock: %s", lock_f)
            except FileNotFoundError:
                pass
            except Exception as e:
                logger.warning("[Watchdog] Could not remove stale lock %s: %s", lock_f, e)

    return len(victims)
|
| 736 |
+
|
| 737 |
+
def _start_gateway():
|
| 738 |
+
"""Start the gateway process with full cleanup of any residual state."""
|
| 739 |
+
# Force-kill ALL residual gateway processes first
|
| 740 |
+
killed = _kill_all_gateways()
|
| 741 |
+
if killed:
|
| 742 |
+
logger.info("[Watchdog] Killed %d residual gateway process(es) before restart", killed)
|
| 743 |
+
|
| 744 |
log_fh = None
|
| 745 |
try:
|
| 746 |
log_fh = open(gw_log, "a")
|
|
@@ -52,12 +52,24 @@ if [ ! -L "$WEBUI_HOME" ]; then
|
|
| 52 |
echo "Symlink: hermes-web-ui -> $WEBUI_DATA"
|
| 53 |
fi
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
# ── Clean up stale PID/lock files from previous crash ──
|
| 56 |
echo "Cleaning up stale state..."
|
| 57 |
rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
|
| 58 |
rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
|
| 59 |
rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
|
| 60 |
rm -f /tmp/hermes-gateway.pid 2>/dev/null
|
|
|
|
|
|
|
|
|
|
| 61 |
echo "Stale state cleaned."
|
| 62 |
|
| 63 |
# Initialize MemPalace if not already
|
|
|
|
| 52 |
echo "Symlink: hermes-web-ui -> $WEBUI_DATA"
|
| 53 |
fi
|
| 54 |
|
| 55 |
+
# ── Force-kill any residual gateway processes from previous crash ──
echo "Cleaning up residual gateway processes..."
# Kill any lingering hermes gateway processes (prevents Feishu lock conflict).
# The dot is escaped so the pattern matches the literal module name
# "hermes_cli.main" rather than any character in that position.
residual_pids=$(pgrep -f "hermes_cli\.main.*gateway" 2>/dev/null)
for pid in $residual_pids; do
    echo " Killing residual gateway PID=$pid"
    kill -9 "$pid" 2>/dev/null
done
# Only pause when something was actually killed; a clean start has nothing
# to wait for.
if [ -n "$residual_pids" ]; then
    sleep 2  # Wait for processes and ports to be fully released
fi

# ── Clean up stale PID/lock files from previous crash ──
echo "Cleaning up stale state..."
rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
rm -f /tmp/hermes-gateway.pid 2>/dev/null
# Clean Feishu lock files
rm -f "$HERMES_HOME"/feishu*.lock 2>/dev/null
rm -f "$HERMES_HOME"/*.feishu_lock 2>/dev/null
echo "Stale state cleaned."
|
| 74 |
|
| 75 |
# Initialize MemPalace if not already
|