| """OOM 处理:MPS/CUDA 显存或 CPU 内存不足时退出进程,由进程管理器重启""" |
| import os |
| import threading |
| import time |
|
|
|
|
| def _check_oom_msg(msg: str) -> bool: |
| patterns = ( |
| "out of memory", |
| "out of memory error", |
| "memory allocation", |
| "cannot allocate memory", |
| "insufficient memory", |
| "ran out of memory", |
| "resource exhausted", |
| "cuda error: out of memory", |
| "mps backend out of memory", |
| ) |
| return any(p in msg.lower() for p in patterns) |
|
|
|
|
| def is_oom_error(e: Exception) -> bool: |
| """检测是否为 OOM(含 MPS/CUDA 显存、CPU 内存),此类错误后进程无法恢复,需重启""" |
| if isinstance(e, MemoryError): |
| return True |
| if _check_oom_msg(str(e)): |
| return True |
| |
| for exc in (getattr(e, "__cause__", None), getattr(e, "__context__", None)): |
| if exc is not None and (isinstance(exc, MemoryError) or _check_oom_msg(str(exc))): |
| return True |
| return False |
|
|
|
|
| def exit_if_oom(e: Exception, defer_seconds: float = 0) -> None: |
| """若为 OOM 则退出进程,由进程管理器重启以恢复内存。 |
| |
| defer_seconds: 延迟退出秒数,用于先返回错误响应再退出(非流式需 > 0) |
| """ |
| if not is_oom_error(e): |
| return |
| msg = f"🛑 OOM 检测到,进程退出以便重启: {e}" |
| if defer_seconds > 0: |
| msg = f"🛑 OOM 检测到,{defer_seconds}s 后进程退出以便重启: {e}" |
| print(msg) |
|
|
| def _exit(): |
| if defer_seconds > 0: |
| time.sleep(defer_seconds) |
| os._exit(1) |
|
|
| if defer_seconds > 0: |
| threading.Thread(target=_exit, daemon=False).start() |
| else: |
| os._exit(1) |
|
|