AutoFix committed on
Commit
75f1efb
·
1 Parent(s): b1e29fe

fix: detect and auto-restart zombie gateway (executor shutdown)

Browse files

The gateway can enter a zombie state where the process is alive and
health checks pass, but the asyncio executor has been shut down.
All message sends fail with 'RuntimeError: Executor shutdown has been called'.

The watchdog now scans the last 50 log lines for 'Executor shutdown'
errors. If detected, it force-kills and restarts the gateway.

Files changed (1) hide show
  1. entry.py +33 -4
entry.py CHANGED
@@ -796,11 +796,40 @@ def _gateway_watchdog(interval: int = 30):
796
 
797
  gw_pid = _find_gateway_pid()
798
  if gw_pid is not None:
799
- # Gateway is alive, reset backoff
800
- restart_backoff = 30
801
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
802
 
803
- # Gateway is dead — restart with backoff
804
  now = time.time()
805
  if now - last_restart < restart_backoff:
806
  continue
 
796
 
797
  gw_pid = _find_gateway_pid()
798
  if gw_pid is not None:
799
+ # Process is alive but check if gateway is FUNCTIONAL (not zombie)
800
+ # A zombie gateway has a live process but dead asyncio executor,
801
+ # causing "Executor shutdown has been called" on every send.
802
+ try:
803
+ # Check the last 50 gateway log lines for executor shutdown errors
804
+ now_ts = time.time()
805
+ _zombie = False
806
+ if os.path.isfile(gw_log):
807
+ with open(gw_log, "r", errors="replace") as _lf:
808
+ for _line in _lf.readlines()[-50:]:
809
+ if "Executor shutdown" in _line and "RuntimeError" in _line:
810
+ _zombie = True
811
+ break
812
+ if _zombie:
813
+ logger.error("[Watchdog] Gateway is ZOMBIE (executor shutdown) — force-restarting...")
814
+ killed = _kill_all_gateways()
815
+ if not killed:
816
+ try:
817
+ os.kill(gw_pid, 9)
818
+ except Exception:
819
+ pass
820
+ time.sleep(3)
821
+ # Fall through to restart below
822
+ else:
823
+ restart_backoff = 30
824
+ continue
825
+ except Exception:
826
+ # If log check fails, assume gateway is fine
827
+ restart_backoff = 30
828
+ continue
829
+ else:
830
+ logger.warning("[Watchdog] Gateway process is DEAD — restarting...")
831
 
832
+ # Gateway is dead or zombie — restart with backoff
833
  now = time.time()
834
  if now - last_restart < restart_backoff:
835
  continue