Optimized server for concurrency and added full engine warmup
Browse files
- .gitignore +2 -0
- App/backend/main.py +29 -6
.gitignore
CHANGED
|
@@ -19,3 +19,5 @@ App/backend/*.wav
|
|
| 19 |
.vscode
|
| 20 |
.idea
|
| 21 |
.DS_Store
|
|
|
|
|
|
|
|
|
| 19 |
.vscode
|
| 20 |
.idea
|
| 21 |
.DS_Store
|
| 22 |
+
# Test outputs
|
| 23 |
+
test_outputs/
|
App/backend/main.py
CHANGED
|
@@ -23,15 +23,25 @@ log = logging.getLogger("uvicorn.error")
|
|
| 23 |
BACKEND_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 24 |
MODELS_DIR = os.path.join(BACKEND_DIR, "models")
|
| 25 |
|
| 26 |
-
# Monkey-patch ONNX Runtime for better provider selection
|
| 27 |
_original_inf_session = onnxruntime.InferenceSession
|
| 28 |
def _patched_inf_session(path_or_bytes, sess_options=None, providers=None, *args, **kwargs):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
if providers is None or providers == ['CPUExecutionProvider']:
|
| 30 |
available = onnxruntime.get_available_providers()
|
|
|
|
| 31 |
preferred = [p for p in ['CUDAExecutionProvider', 'DmlExecutionProvider', 'CoreMLExecutionProvider'] if p in available]
|
| 32 |
preferred.append('CPUExecutionProvider')
|
| 33 |
providers = preferred
|
| 34 |
-
|
|
|
|
| 35 |
onnxruntime.InferenceSession = _patched_inf_session
|
| 36 |
|
| 37 |
# Load engines lazily
|
|
@@ -101,11 +111,24 @@ def synthesize_supertonic(text: str, voice_id: str, speed: float = 1.0, quality:
|
|
| 101 |
async def lifespan(app: FastAPI):
|
| 102 |
log.info("Warming up TTS engines...")
|
| 103 |
try:
|
| 104 |
-
#
|
| 105 |
-
|
| 106 |
-
log.info("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
except Exception as e:
|
| 108 |
-
log.warning(f"
|
| 109 |
yield
|
| 110 |
|
| 111 |
app = FastAPI(lifespan=lifespan)
|
|
|
|
| 23 |
BACKEND_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 24 |
MODELS_DIR = os.path.join(BACKEND_DIR, "models")
|
| 25 |
|
| 26 |
+
# Monkey-patch ONNX Runtime for better provider selection and thread management.
# The original constructor is kept so the wrapper below can delegate to it.
_original_inf_session = onnxruntime.InferenceSession


def _patched_inf_session(path_or_bytes, sess_options=None, providers=None, *args, **kwargs):
    """Create an ONNX Runtime session with single-threaded ops and GPU-first providers.

    Installed in place of ``onnxruntime.InferenceSession``; adjusts
    ``sess_options`` and ``providers`` and then delegates to the saved
    original constructor.

    Args:
        path_or_bytes: Forwarded unchanged to the original constructor.
        sess_options: Session options to use. A fresh
            ``onnxruntime.SessionOptions`` is created when this is ``None``.
            The intra-op and inter-op thread counts are always set to 1, so a
            caller-supplied options object is modified in place.
        providers: Execution providers. When ``None`` or exactly
            ``['CPUExecutionProvider']``, it is replaced by whichever of
            ``CUDAExecutionProvider``, ``DmlExecutionProvider`` and
            ``CoreMLExecutionProvider`` are available, followed by
            ``CPUExecutionProvider``. Any other value is passed through as is.
        *args: Extra positional arguments, forwarded after ``providers``.
        **kwargs: Extra keyword arguments, forwarded unchanged.

    Returns:
        Whatever the original ``onnxruntime.InferenceSession`` returns.
    """
    if sess_options is None:
        sess_options = onnxruntime.SessionOptions()

    # Crucial for shared server environments: Limit threads per request to allow better
    # concurrency without CPU lockup (especially on Hugging Face free spaces)
    sess_options.intra_op_num_threads = 1
    sess_options.inter_op_num_threads = 1

    if providers is None or providers == ['CPUExecutionProvider']:
        available = onnxruntime.get_available_providers()
        # Prefer GPU if available
        gpu_candidates = ('CUDAExecutionProvider', 'DmlExecutionProvider', 'CoreMLExecutionProvider')
        providers = [name for name in gpu_candidates if name in available]
        providers.append('CPUExecutionProvider')

    # Forward the named parameters positionally. Passing them as keywords ahead
    # of *args would bind the first extra positional argument to the
    # ``sess_options`` slot and raise "got multiple values for argument".
    return _original_inf_session(path_or_bytes, sess_options, providers, *args, **kwargs)


onnxruntime.InferenceSession = _patched_inf_session
|
| 46 |
|
| 47 |
# Load engines lazily
|
|
|
|
| 111 |
async def lifespan(app: FastAPI):
    """Warm up the TTS engines, then yield so the application can run.

    Warmup is best-effort. Each step runs in its own ``try`` so that a
    failure in one engine is logged as a warning without preventing the
    remaining engines from being warmed up. The final "fully primed" message
    is logged only when every step succeeded.

    Args:
        app: The FastAPI application this handler is attached to (unused here).

    Yields:
        None, once all warmup steps have been attempted.
    """
    log.info("Warming up TTS engines...")

    # (callable, positional args, message logged on success or None), in order.
    warmup_steps = (
        # 1. Warm up Supertonic (Premium)
        (get_supertonic, (), "Supertonic engine ready."),
        # 2. Warm up Standard Piper (US Female)
        (get_piper_voice, ("en_US-lessac-low",), "Piper (standard) ready."),
        # 3. Warm up Multi-speaker Piper (LibriTTS) - This is the default in UI
        (get_piper_voice, ("en_US-libritts_r-medium",), "Piper (multi-speaker) ready."),
        # 4. Perform a tiny synthesis to ensure inference graph is built
        (synthesize_piper, ("READY", "en_US-lessac-low", None), None),
    )

    all_ready = True
    for step, step_args, ready_message in warmup_steps:
        try:
            step(*step_args)
        except Exception as e:
            # Deliberately non-fatal: the server still starts and the engine
            # can be loaded later; only the warmup is skipped.
            all_ready = False
            log.warning(f"Engine warmup incomplete: {e}")
        else:
            if ready_message is not None:
                log.info(ready_message)

    if all_ready:
        log.info("All TTS engines fully primed.")

    yield
|
| 133 |
|
| 134 |
app = FastAPI(lifespan=lifespan)
|