Spaces:
Build error
fix: 4 BLOCKERs + 6 IMPORTANTs from deep-check audit
Browse files

BLOCKERs:
- vlm.py: filter VLM-returned tokens against the closed vocabulary set.
Off-vocab strings ('letter', 'no_sign', 'n/a') were getting stamped
with a fake 0.85 confidence and leaking into the demo.
- space.py: pass the _new_session FACTORY (callable) to gr.State, not
the result. Guarantees per-tab isolation; state-sharing across
browser sessions has been observed with the previous form in some
Gradio 4.x configurations.
- app.py: bind 0.0.0.0 by default. HF Spaces' reverse proxy expects
the app on the container's external interface; 127.0.0.1-only
silently rejects all incoming traffic on the live Space. Local
sandboxes still override via SIGNBRIDGE_HOST=127.0.0.1.
- README.md: bump frontmatter sdk_version 4.44.0 -> 4.44.1 to match
requirements.txt pin (avoid HF Spaces gradio-skew build issues).
IMPORTANTs:
- imageio.py: apply EXIF rotation via ImageOps.exif_transpose so
phone-camera photos arrive at the VLM upright (was silently feeding
rotated images on every iPhone gold-set sample).
- imageio.py: handle float dtype properly in array_to_rgb. Naive
.astype(np.uint8) on float[0,1] was truncating to all-zeros — the
same black-frame failure mode the alpha fix already eliminated for
the path-load codepath.
- tts.py: silent-stub returns None on import failure, not '' (the
empty string was a type-annotation lie and broke gr.Audio).
- tts.py: cap retry attempts on transient XTTS load failure (3) and
use threading.Lock for the singleton — prevents a single bad cold
start from permanently muting the demo and prevents concurrent
double-loads of the 2 GB model.
- tts.py: switch cache key from abs(hash(text)) to sha256. Python's
hash() is salted per-process by default, so the cache effectively
reset every cold start; sha256 is stable across processes.
- composer/sentence.py + recognizer/vlm.py: log only exception type
name (not full message). httpx surfaces request URLs with embedded
credentials in error messages; logger.exception() was a credential-
leak surface in public HF Space stdout.
- composer/sentence.py: fall back to naive_join when the LLM returns
empty content. Previously this path returned '' silently and the UI
played no audio with no error.
68 tests still passing; ruff clean; live VLM smoke test confirms
recognizer + composer still work end-to-end (sign A -> 'A' 0.85,
composer 'Hello, my name is Lucas.').
- README.md +1 -1
- app.py +6 -4
- signbridge/composer/sentence.py +15 -4
- signbridge/imageio.py +35 -7
- signbridge/recognizer/vlm.py +15 -3
- signbridge/space.py +7 -1
- signbridge/voice/tts.py +52 -25
|
@@ -4,7 +4,7 @@ emoji: 🤟
|
|
| 4 |
colorFrom: indigo
|
| 5 |
colorTo: pink
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
thumbnail: assets/cover.png
|
|
|
|
| 4 |
colorFrom: indigo
|
| 5 |
colorTo: pink
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
thumbnail: assets/cover.png
|
|
@@ -17,10 +17,12 @@ from signbridge.space import build_demo
|
|
| 17 |
def main() -> None:
|
| 18 |
load_dotenv()
|
| 19 |
demo = build_demo()
|
| 20 |
-
#
|
| 21 |
-
#
|
| 22 |
-
#
|
| 23 |
-
|
|
|
|
|
|
|
| 24 |
port = int(os.getenv("SIGNBRIDGE_PORT", "7860"))
|
| 25 |
demo.launch(
|
| 26 |
server_name=host,
|
|
|
|
| 17 |
def main() -> None:
|
| 18 |
load_dotenv()
|
| 19 |
demo = build_demo()
|
| 20 |
+
# Bind 0.0.0.0 by default — HF Spaces' reverse proxy expects the app to
|
| 21 |
+
# listen on the container's external interface, and 127.0.0.1-only would
|
| 22 |
+
# silently reject all incoming requests on the live Space. For local dev
|
| 23 |
+
# boot-tests inside sandboxes that can't talk to 0.0.0.0, override with
|
| 24 |
+
# `SIGNBRIDGE_HOST=127.0.0.1`.
|
| 25 |
+
host = os.getenv("SIGNBRIDGE_HOST", "0.0.0.0")
|
| 26 |
port = int(os.getenv("SIGNBRIDGE_PORT", "7860"))
|
| 27 |
demo.launch(
|
| 28 |
server_name=host,
|
|
@@ -125,12 +125,23 @@ def compose_sentence(signs: Sequence[str]) -> str:
|
|
| 125 |
temperature=0.2,
|
| 126 |
max_tokens=120,
|
| 127 |
)
|
| 128 |
-
text = resp.choices[0].message.content or ""
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
| 132 |
return _naive_join(signs)
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
def _strip_quotes(text: str) -> str:
|
| 136 |
return re.sub(r'^["\']|["\']$', "", text).strip()
|
|
|
|
| 125 |
temperature=0.2,
|
| 126 |
max_tokens=120,
|
| 127 |
)
|
| 128 |
+
text = (resp.choices[0].message.content or "").strip()
|
| 129 |
+
except Exception as exc: # noqa: BLE001 — broad catch is intentional at the boundary
|
| 130 |
+
# Log only the exception type; full message can include the request
|
| 131 |
+
# URL with embedded credentials when the OpenAI-compatible client
|
| 132 |
+
# surfaces an httpx error.
|
| 133 |
+
logger.warning("composer LLM call failed: %s", type(exc).__name__)
|
| 134 |
return _naive_join(signs)
|
| 135 |
|
| 136 |
+
cleaned = _strip_quotes(text)
|
| 137 |
+
if not cleaned:
|
| 138 |
+
# LLM returned empty content — fall back to the naive joiner so the
|
| 139 |
+
# demo still produces *something* readable instead of silently
|
| 140 |
+
# playing no audio.
|
| 141 |
+
logger.info("composer LLM returned empty content; using naive joiner.")
|
| 142 |
+
return _naive_join(signs)
|
| 143 |
+
return cleaned
|
| 144 |
+
|
| 145 |
|
| 146 |
def _strip_quotes(text: str) -> str:
|
| 147 |
return re.sub(r'^["\']|["\']$', "", text).strip()
|
|
@@ -17,10 +17,11 @@ import numpy as np
|
|
| 17 |
def load_rgb(source: str | Path | bytes | io.IOBase) -> np.ndarray:
|
| 18 |
"""Load an image as an RGB ndarray, compositing any alpha onto white.
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
|
|
|
| 22 |
"""
|
| 23 |
-
from PIL import Image
|
| 24 |
|
| 25 |
if isinstance(source, (str, Path)):
|
| 26 |
img = Image.open(source)
|
|
@@ -29,6 +30,14 @@ def load_rgb(source: str | Path | bytes | io.IOBase) -> np.ndarray:
|
|
| 29 |
else:
|
| 30 |
img = Image.open(source)
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
return _composite_to_rgb(img)
|
| 33 |
|
| 34 |
|
|
@@ -36,21 +45,40 @@ def array_to_rgb(arr: np.ndarray) -> np.ndarray:
|
|
| 36 |
"""Convert an arbitrary-shape ndarray (H,W,3 or H,W,4) to RGB on white.
|
| 37 |
|
| 38 |
Used at the recognizer's API boundary in case a caller hands us a
|
| 39 |
-
pre-decoded RGBA array.
|
|
|
|
|
|
|
| 40 |
"""
|
| 41 |
from PIL import Image
|
| 42 |
|
| 43 |
if arr.ndim == 2:
|
| 44 |
-
img = Image.fromarray(arr).convert("RGB")
|
| 45 |
return np.asarray(img)
|
|
|
|
|
|
|
| 46 |
if arr.shape[-1] == 3:
|
| 47 |
-
return
|
| 48 |
if arr.shape[-1] == 4:
|
| 49 |
-
img = Image.fromarray(arr, mode="RGBA")
|
| 50 |
return _composite_to_rgb(img)
|
| 51 |
raise ValueError(f"unsupported array shape for RGB conversion: {arr.shape}")
|
| 52 |
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
def _composite_to_rgb(img) -> np.ndarray: # noqa: ANN001
|
| 55 |
from PIL import Image
|
| 56 |
|
|
|
|
| 17 |
def load_rgb(source: str | Path | bytes | io.IOBase) -> np.ndarray:
|
| 18 |
"""Load an image as an RGB ndarray, compositing any alpha onto white.
|
| 19 |
|
| 20 |
+
Also applies EXIF rotation so phone-camera photos arrive at the VLM
|
| 21 |
+
upright. Accepts a filesystem path, raw bytes, or any file-like object
|
| 22 |
+
PIL knows how to open.
|
| 23 |
"""
|
| 24 |
+
from PIL import Image, ImageOps
|
| 25 |
|
| 26 |
if isinstance(source, (str, Path)):
|
| 27 |
img = Image.open(source)
|
|
|
|
| 30 |
else:
|
| 31 |
img = Image.open(source)
|
| 32 |
|
| 33 |
+
# Force-load before EXIF transpose so the image is in memory and the
|
| 34 |
+
# source file/buffer can be released. Required because the rest of the
|
| 35 |
+
# pipeline holds the array, not the PIL handle.
|
| 36 |
+
img.load()
|
| 37 |
+
# Honour EXIF orientation. Phone cameras often store landscape rotation
|
| 38 |
+
# in EXIF rather than rotating the pixel data; without this every
|
| 39 |
+
# portrait-mode photo arrives at the VLM rotated 90°/180°/270°.
|
| 40 |
+
img = ImageOps.exif_transpose(img)
|
| 41 |
return _composite_to_rgb(img)
|
| 42 |
|
| 43 |
|
|
|
|
| 45 |
"""Convert an arbitrary-shape ndarray (H,W,3 or H,W,4) to RGB on white.
|
| 46 |
|
| 47 |
Used at the recognizer's API boundary in case a caller hands us a
|
| 48 |
+
pre-decoded RGBA array. Float arrays in [0, 1] are scaled to uint8;
|
| 49 |
+
naive `.astype(np.uint8)` would truncate to all-zeros (the same
|
| 50 |
+
black-frame failure mode the alpha fix already eliminated for paths).
|
| 51 |
"""
|
| 52 |
from PIL import Image
|
| 53 |
|
| 54 |
if arr.ndim == 2:
|
| 55 |
+
img = Image.fromarray(_to_uint8(arr)).convert("RGB")
|
| 56 |
return np.asarray(img)
|
| 57 |
+
if arr.ndim != 3:
|
| 58 |
+
raise ValueError(f"unsupported array shape for RGB conversion: {arr.shape}")
|
| 59 |
if arr.shape[-1] == 3:
|
| 60 |
+
return _to_uint8(arr)
|
| 61 |
if arr.shape[-1] == 4:
|
| 62 |
+
img = Image.fromarray(_to_uint8(arr), mode="RGBA")
|
| 63 |
return _composite_to_rgb(img)
|
| 64 |
raise ValueError(f"unsupported array shape for RGB conversion: {arr.shape}")
|
| 65 |
|
| 66 |
|
| 67 |
+
def _to_uint8(arr: np.ndarray) -> np.ndarray:
|
| 68 |
+
"""Coerce an ndarray to uint8 without truncating float [0, 1] to zero."""
|
| 69 |
+
if arr.dtype == np.uint8:
|
| 70 |
+
return arr
|
| 71 |
+
if np.issubdtype(arr.dtype, np.floating):
|
| 72 |
+
# Heuristic: if max is ≤ 1.0, it's a normalised [0, 1] image.
|
| 73 |
+
# Otherwise assume the caller already scaled to 0–255.
|
| 74 |
+
if arr.size and float(arr.max()) <= 1.0:
|
| 75 |
+
arr = arr * 255.0
|
| 76 |
+
return np.clip(arr, 0, 255).astype(np.uint8)
|
| 77 |
+
if np.issubdtype(arr.dtype, np.integer):
|
| 78 |
+
return np.clip(arr, 0, 255).astype(np.uint8)
|
| 79 |
+
return arr.astype(np.uint8)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
def _composite_to_rgb(img) -> np.ndarray: # noqa: ANN001
|
| 83 |
from PIL import Image
|
| 84 |
|
|
@@ -41,6 +41,11 @@ _VLM_VOCAB = (
|
|
| 41 |
"see know understand think feel happy sad tired hungry wait "
|
| 42 |
"unknown"
|
| 43 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
_PROMPT = (
|
| 46 |
"You are an expert in American Sign Language (ASL). Look at this image of a "
|
|
@@ -163,10 +168,17 @@ def recognize_sign_from_frame(frame: np.ndarray) -> tuple[str, float]:
|
|
| 163 |
)
|
| 164 |
raw = (resp.choices[0].message.content or "").strip()
|
| 165 |
token = _normalise(raw)
|
| 166 |
-
except Exception: # noqa: BLE001 — broad at the boundary on purpose
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
return "", 0.0
|
| 169 |
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
| 171 |
return "", 0.0
|
| 172 |
return token, 0.85
|
|
|
|
| 41 |
"see know understand think feel happy sad tired hungry wait "
|
| 42 |
"unknown"
|
| 43 |
)
|
| 44 |
+
# Pre-built set for membership tests at recognition time. Tokens not in this
|
| 45 |
+
# set get suppressed (confidence 0.0) — VLMs hallucinate strings like
|
| 46 |
+
# "letter", "no_sign", "n/a" that would otherwise leak into the demo with a
|
| 47 |
+
# fake 0.85 confidence.
|
| 48 |
+
_VLM_VOCAB_SET = frozenset(_VLM_VOCAB.split())
|
| 49 |
|
| 50 |
_PROMPT = (
|
| 51 |
"You are an expert in American Sign Language (ASL). Look at this image of a "
|
|
|
|
| 168 |
)
|
| 169 |
raw = (resp.choices[0].message.content or "").strip()
|
| 170 |
token = _normalise(raw)
|
| 171 |
+
except Exception as exc: # noqa: BLE001 — broad at the boundary on purpose
|
| 172 |
+
# Log only the exception type — full message can include the request
|
| 173 |
+
# URL with embedded credentials when the OpenAI-compatible client
|
| 174 |
+
# bubbles up an httpx error. We pay log fidelity to avoid leaking the
|
| 175 |
+
# provider key into a public HF Space stdout.
|
| 176 |
+
logger.warning("VLM recognition failed: %s", type(exc).__name__)
|
| 177 |
return "", 0.0
|
| 178 |
|
| 179 |
+
# Suppress any token that isn't in the closed vocabulary the prompt
|
| 180 |
+
# explicitly requested. Without this, a VLM that returns "letter" or
|
| 181 |
+
# "no_sign" would be reported as a confident prediction.
|
| 182 |
+
if token in {"", "unknown"} or token not in _VLM_VOCAB_SET:
|
| 183 |
return "", 0.0
|
| 184 |
return token, 0.85
|
|
@@ -133,7 +133,13 @@ def build_demo() -> gr.Blocks:
|
|
| 133 |
"sentence and hear it spoken aloud. Powered by AMD Instinct MI300X."
|
| 134 |
)
|
| 135 |
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
with gr.Row():
|
| 139 |
with gr.Column(scale=3):
|
|
|
|
| 133 |
"sentence and hear it spoken aloud. Powered by AMD Instinct MI300X."
|
| 134 |
)
|
| 135 |
|
| 136 |
+
# Pass the FACTORY (callable), not the result. Gradio invokes
|
| 137 |
+
# callable State values once per session — guarantees per-tab
|
| 138 |
+
# isolation. Using `gr.State(_new_session())` instead would create
|
| 139 |
+
# a single shared instance at module-load time, which has been
|
| 140 |
+
# observed to leak state across browser tabs in some Gradio 4.x
|
| 141 |
+
# configurations.
|
| 142 |
+
state = gr.State(_new_session)
|
| 143 |
|
| 144 |
with gr.Row():
|
| 145 |
with gr.Column(scale=3):
|
|
@@ -7,9 +7,11 @@ so the Gradio app still produces something playable.
|
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
|
|
|
| 10 |
import logging
|
| 11 |
import os
|
| 12 |
import tempfile
|
|
|
|
| 13 |
from pathlib import Path
|
| 14 |
|
| 15 |
logger = logging.getLogger(__name__)
|
|
@@ -17,37 +19,55 @@ logger = logging.getLogger(__name__)
|
|
| 17 |
DEFAULT_MODEL = os.getenv(
|
| 18 |
"SIGNBRIDGE_TTS_MODEL", "tts_models/multilingual/multi-dataset/xtts_v2"
|
| 19 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
class _TTSEngine:
|
| 23 |
def __init__(self) -> None:
|
| 24 |
self._tts = None
|
| 25 |
-
self.
|
| 26 |
-
self.
|
| 27 |
self._cache_dir = Path(tempfile.gettempdir()) / "signbridge_tts"
|
| 28 |
self._cache_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 29 |
|
| 30 |
def _ensure_loaded(self) -> None:
|
| 31 |
-
if self.
|
| 32 |
return
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
"TTS package not installed; voice output will be silent. "
|
| 39 |
"Install via `pip install TTS>=0.22`."
|
| 40 |
)
|
| 41 |
-
self.
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
def synthesize(self, text: str) -> str | None:
|
| 53 |
if not text:
|
|
@@ -56,7 +76,7 @@ class _TTSEngine:
|
|
| 56 |
if self._tts is None:
|
| 57 |
return self._silent_stub(text)
|
| 58 |
|
| 59 |
-
out_path = self._cache_dir / f"{
|
| 60 |
if out_path.exists():
|
| 61 |
return str(out_path)
|
| 62 |
|
|
@@ -67,21 +87,28 @@ class _TTSEngine:
|
|
| 67 |
language="en",
|
| 68 |
# XTTS-v2 needs a speaker reference; omit to use the default voice.
|
| 69 |
)
|
| 70 |
-
except Exception: # noqa: BLE001
|
| 71 |
-
logger.
|
|
|
|
|
|
|
| 72 |
return self._silent_stub(text)
|
| 73 |
return str(out_path)
|
| 74 |
|
| 75 |
-
def _silent_stub(self, text: str) -> str:
|
| 76 |
-
"""Emit a 0.5 s silent WAV so the Gradio audio component has something to play.
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
if out_path.exists():
|
| 79 |
return str(out_path)
|
| 80 |
try:
|
| 81 |
import numpy as np
|
| 82 |
import soundfile as sf # type: ignore[import-not-found]
|
| 83 |
except ImportError:
|
| 84 |
-
return
|
| 85 |
sf.write(str(out_path), np.zeros(8000, dtype="int16"), 16000)
|
| 86 |
return str(out_path)
|
| 87 |
|
|
|
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
+
import hashlib
|
| 11 |
import logging
|
| 12 |
import os
|
| 13 |
import tempfile
|
| 14 |
+
import threading
|
| 15 |
from pathlib import Path
|
| 16 |
|
| 17 |
logger = logging.getLogger(__name__)
|
|
|
|
| 19 |
DEFAULT_MODEL = os.getenv(
|
| 20 |
"SIGNBRIDGE_TTS_MODEL", "tts_models/multilingual/multi-dataset/xtts_v2"
|
| 21 |
)
|
| 22 |
+
# Cap on consecutive transient load failures before we give up retrying.
|
| 23 |
+
# Without this we permanently mute the demo on a single bad cold-start.
|
| 24 |
+
_MAX_LOAD_FAILURES = 3
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _cache_key(text: str) -> str:
|
| 28 |
+
"""Stable per-text cache key. Python's `hash()` is salted per-process
|
| 29 |
+
(PYTHONHASHSEED) so it changes every cold start and defeats the cache."""
|
| 30 |
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
|
| 31 |
|
| 32 |
|
| 33 |
class _TTSEngine:
|
| 34 |
def __init__(self) -> None:
|
| 35 |
self._tts = None
|
| 36 |
+
self._import_failed = False
|
| 37 |
+
self._load_failures = 0
|
| 38 |
self._cache_dir = Path(tempfile.gettempdir()) / "signbridge_tts"
|
| 39 |
self._cache_dir.mkdir(parents=True, exist_ok=True)
|
| 40 |
+
self._load_lock = threading.Lock()
|
| 41 |
|
| 42 |
def _ensure_loaded(self) -> None:
|
| 43 |
+
if self._tts is not None or self._import_failed:
|
| 44 |
return
|
| 45 |
+
if self._load_failures >= _MAX_LOAD_FAILURES:
|
| 46 |
+
return
|
| 47 |
+
with self._load_lock:
|
| 48 |
+
# Re-check after acquiring lock (double-checked locking).
|
| 49 |
+
if self._tts is not None or self._import_failed:
|
| 50 |
+
return
|
| 51 |
+
try:
|
| 52 |
+
from TTS.api import TTS # type: ignore[import-not-found]
|
| 53 |
+
except ImportError:
|
| 54 |
+
logger.warning(
|
| 55 |
"TTS package not installed; voice output will be silent. "
|
| 56 |
"Install via `pip install TTS>=0.22`."
|
| 57 |
)
|
| 58 |
+
self._import_failed = True
|
| 59 |
+
return
|
| 60 |
+
|
| 61 |
+
try:
|
| 62 |
+
self._tts = TTS(model_name=DEFAULT_MODEL, progress_bar=False)
|
| 63 |
+
except Exception as exc: # noqa: BLE001
|
| 64 |
+
self._load_failures += 1
|
| 65 |
+
logger.warning(
|
| 66 |
+
"XTTS-v2 load failed (attempt %d/%d): %s",
|
| 67 |
+
self._load_failures,
|
| 68 |
+
_MAX_LOAD_FAILURES,
|
| 69 |
+
type(exc).__name__,
|
| 70 |
+
)
|
| 71 |
|
| 72 |
def synthesize(self, text: str) -> str | None:
|
| 73 |
if not text:
|
|
|
|
| 76 |
if self._tts is None:
|
| 77 |
return self._silent_stub(text)
|
| 78 |
|
| 79 |
+
out_path = self._cache_dir / f"{_cache_key(text)}.wav"
|
| 80 |
if out_path.exists():
|
| 81 |
return str(out_path)
|
| 82 |
|
|
|
|
| 87 |
language="en",
|
| 88 |
# XTTS-v2 needs a speaker reference; omit to use the default voice.
|
| 89 |
)
|
| 90 |
+
except Exception as exc: # noqa: BLE001
|
| 91 |
+
logger.warning(
|
| 92 |
+
"XTTS synthesis failed (%s); emitting silent stub.", type(exc).__name__
|
| 93 |
+
)
|
| 94 |
return self._silent_stub(text)
|
| 95 |
return str(out_path)
|
| 96 |
|
| 97 |
+
def _silent_stub(self, text: str) -> str | None:
|
| 98 |
+
"""Emit a 0.5 s silent WAV so the Gradio audio component has something to play.
|
| 99 |
+
|
| 100 |
+
Returns None when even the stub can't be written (numpy/soundfile not
|
| 101 |
+
available); callers must handle None and present a clean "no audio"
|
| 102 |
+
UI rather than feeding "" into a Gradio Audio component.
|
| 103 |
+
"""
|
| 104 |
+
out_path = self._cache_dir / f"silent_{_cache_key(text)}.wav"
|
| 105 |
if out_path.exists():
|
| 106 |
return str(out_path)
|
| 107 |
try:
|
| 108 |
import numpy as np
|
| 109 |
import soundfile as sf # type: ignore[import-not-found]
|
| 110 |
except ImportError:
|
| 111 |
+
return None
|
| 112 |
sf.write(str(out_path), np.zeros(8000, dtype="int16"), 16000)
|
| 113 |
return str(out_path)
|
| 114 |
|