Spaces:
Running
Running
AutoFix committed on
Commit ·
75f1efb
1
Parent(s): b1e29fe
fix: detect and auto-restart zombie gateway (executor shutdown)
Browse files
The gateway can enter a zombie state where the process is alive and
health checks pass, but the asyncio executor has been shut down.
All message sends fail with 'RuntimeError: Executor shutdown has been called'.
The watchdog now scans the last 50 log lines for 'Executor shutdown'
errors. If detected, it force-kills and restarts the gateway.
entry.py
CHANGED
|
@@ -796,11 +796,40 @@ def _gateway_watchdog(interval: int = 30):
|
|
| 796 |
|
| 797 |
gw_pid = _find_gateway_pid()
|
| 798 |
if gw_pid is not None:
|
| 799 |
-
#
|
| 800 |
-
|
| 801 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 802 |
|
| 803 |
-
# Gateway is dead — restart with backoff
|
| 804 |
now = time.time()
|
| 805 |
if now - last_restart < restart_backoff:
|
| 806 |
continue
|
|
|
|
| 796 |
|
| 797 |
gw_pid = _find_gateway_pid()
|
| 798 |
if gw_pid is not None:
|
| 799 |
+
# Process is alive — but check if gateway is FUNCTIONAL (not zombie)
|
| 800 |
+
# A zombie gateway has a live process but dead asyncio executor,
|
| 801 |
+
# causing "Executor shutdown has been called" on every send.
|
| 802 |
+
try:
|
| 803 |
+
# Check the last 50 gateway log lines for executor shutdown errors
|
| 804 |
+
now_ts = time.time()
|
| 805 |
+
_zombie = False
|
| 806 |
+
if os.path.isfile(gw_log):
|
| 807 |
+
with open(gw_log, "r", errors="replace") as _lf:
|
| 808 |
+
for _line in _lf.readlines()[-50:]:
|
| 809 |
+
if "Executor shutdown" in _line and "RuntimeError" in _line:
|
| 810 |
+
_zombie = True
|
| 811 |
+
break
|
| 812 |
+
if _zombie:
|
| 813 |
+
logger.error("[Watchdog] Gateway is ZOMBIE (executor shutdown) — force-restarting...")
|
| 814 |
+
killed = _kill_all_gateways()
|
| 815 |
+
if not killed:
|
| 816 |
+
try:
|
| 817 |
+
os.kill(gw_pid, 9)
|
| 818 |
+
except Exception:
|
| 819 |
+
pass
|
| 820 |
+
time.sleep(3)
|
| 821 |
+
# Fall through to restart below
|
| 822 |
+
else:
|
| 823 |
+
restart_backoff = 30
|
| 824 |
+
continue
|
| 825 |
+
except Exception:
|
| 826 |
+
# If log check fails, assume gateway is fine
|
| 827 |
+
restart_backoff = 30
|
| 828 |
+
continue
|
| 829 |
+
else:
|
| 830 |
+
logger.warning("[Watchdog] Gateway process is DEAD — restarting...")
|
| 831 |
|
| 832 |
+
# Gateway is dead or zombie — restart with backoff
|
| 833 |
now = time.time()
|
| 834 |
if now - last_restart < restart_backoff:
|
| 835 |
continue
|