ShadowHunter222 committed on
Commit
d61edf1
Β·
verified Β·
1 Parent(s): 35e362a

Upload 6 files

Browse files
Files changed (6) hide show
  1. Dockerfile +36 -0
  2. app.py +915 -0
  3. chatterbox_wrapper.py +534 -0
  4. config.py +100 -0
  5. requirements.txt +24 -0
  6. text_processor.py +206 -0
Dockerfile ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ══════════════════════════════════════════════════════════════
# Chatterbox Turbo TTS β€” CPU-Optimised Docker Image
# ══════════════════════════════════════════════════════════════
FROM python:3.11-slim

# Audio codec libraries for soundfile/librosa
RUN apt-get update && \
    apt-get install -y --no-install-recommends libsndfile1 ffmpeg && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install PyTorch CPU first (from dedicated index for smaller size)
RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu

# Install remaining dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY config.py text_processor.py chatterbox_wrapper.py app.py ./

# Pre-download ONNX models + tokenizer at build time
RUN python -c "\
from chatterbox_wrapper import ChatterboxWrapper; \
ChatterboxWrapper(download_only=True); \
print('Models pre-downloaded successfully')"

# Prevent thread oversubscription in production.
# Single ENV instruction -> one image layer instead of three.
# PYTHONUNBUFFERED makes uvicorn/app logs reach `docker logs` immediately.
ENV OMP_NUM_THREADS=1 \
    MKL_NUM_THREADS=1 \
    OPENBLAS_NUM_THREADS=1 \
    PYTHONUNBUFFERED=1

EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app.py ADDED
@@ -0,0 +1,915 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chatterbox Turbo TTS -- FastAPI Server
3
+ ======================================
4
+ Production-ready API with true real-time MP3 streaming,
5
+ in-memory voice cloning, and fully non-blocking inference.
6
+
7
+ Endpoints:
8
+ GET /health -> health check + optional warmup
9
+ GET /info -> model info, supported tags, parameters
10
+ POST /tts -> full audio response (WAV/MP3/FLAC)
11
+ POST /tts/stream -> chunked MP3 streaming (MediaSource-ready)
12
+ POST /tts/true-stream -> alias for /tts/stream (Kokoro compat)
13
+ POST /tts/stop/{stream_id}-> cancel a specific active stream
14
+ POST /tts/stop -> cancel ALL active streams
15
+ POST /v1/audio/speech -> OpenAI-compatible streaming
16
+ """
17
+ import asyncio
18
+ import io
19
+ import json
20
+ import logging
21
+ import queue as stdlib_queue
22
+ import threading
23
+ import time
24
+ import urllib.error
25
+ import urllib.parse
26
+ import urllib.request
27
+ import uuid
28
+ from concurrent.futures import ThreadPoolExecutor
29
+ from typing import Generator, Optional
30
+
31
+ import numpy as np
32
+ import soundfile as sf
33
+ from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
34
+ from fastapi.responses import Response, StreamingResponse
35
+ from contextlib import asynccontextmanager
36
+
37
+ from config import Config
38
+ from chatterbox_wrapper import ChatterboxWrapper, GenerationCancelled, VoiceProfile
39
+ import text_processor
40
+
41
+ # ── Logging ───────────────────────────────────────────────────────
42
+ logging.basicConfig(
43
+ level=logging.INFO,
44
+ format="%(asctime)s β”‚ %(levelname)-7s β”‚ %(name)s β”‚ %(message)s",
45
+ datefmt="%H:%M:%S",
46
+ )
47
+ logger = logging.getLogger(__name__)
48
+
49
+ # ── Thread pool for CPU-bound inference ───────────────────────────
50
+ tts_executor = ThreadPoolExecutor(max_workers=Config.MAX_WORKERS)
51
+
52
+
53
+ # ── Lifespan ──────────────────────────────────────────────────────
54
+
55
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the TTS model before serving; stop the inference pool on shutdown."""
    try:
        app.state.wrapper = ChatterboxWrapper()
        logger.info("βœ… Model loaded, server ready")
    except Exception as e:
        # Fail hard: the server is useless without the model.
        logger.error(f"❌ Model loading failed: {e}")
        raise
    yield
    # Shutdown path: don't block on in-flight inference jobs.
    tts_executor.shutdown(wait=False)
66
+
67
+
68
# ASGI application instance; `lifespan` above handles model load/teardown.
app = FastAPI(
    title="Chatterbox Turbo TTS API",
    version="1.0.0",
    docs_url="/docs",
    lifespan=lifespan,
)
74
+
75
+
76
+ # ── CORS Middleware ───────────────────────────────────────────────
77
+
78
@app.middleware("http")
async def cors_middleware(request: Request, call_next):
    """Allow-list CORS: answer preflights, decorate allowed responses, 403 the rest."""
    origin = request.headers.get("origin")
    origin_allowed = origin in Config.ALLOWED_ORIGINS

    # Preflight from an allowed origin is answered without hitting a route.
    if request.method == "OPTIONS" and origin_allowed:
        preflight_headers = {
            "Access-Control-Allow-Origin": origin,
            "Access-Control-Allow-Methods": "*",
            "Access-Control-Allow-Headers": "*",
            "Access-Control-Allow-Credentials": "true",
        }
        return Response(status_code=200, headers=preflight_headers)

    # No Origin header (same-origin / curl) passes through untouched;
    # allowed cross-origin requests get the CORS headers appended.
    if not origin or origin_allowed:
        response = await call_next(request)
        if origin:
            response.headers["Access-Control-Allow-Origin"] = origin
            response.headers["Access-Control-Allow-Credentials"] = "true"
            response.headers["Access-Control-Allow-Methods"] = "*"
            response.headers["Access-Control-Allow-Headers"] = "*"
            # Let browsers read the stream id for /tts/stop cancellation.
            response.headers["Access-Control-Expose-Headers"] = "X-Stream-Id"
        return response

    logger.warning(f"🚫 Blocked origin: {origin}")
    return Response(status_code=403, content="Forbidden: Origin not allowed")
106
+
107
+
108
+ # ═══════════════════════════════════════════════════════════════════
109
+ # Helper: resolve voice from optional upload
110
+ # ═══════════════════════════════════════════════════════════════════
111
+
112
async def _resolve_voice(
    voice_ref: Optional[UploadFile],
    wrapper: ChatterboxWrapper,
) -> VoiceProfile:
    """Return a VoiceProfile from uploaded audio or the default voice."""
    if voice_ref is None or voice_ref.filename == "":
        return wrapper.default_voice

    audio_bytes = await voice_ref.read()
    upload_size = len(audio_bytes)
    if upload_size > Config.MAX_VOICE_UPLOAD_BYTES:
        raise HTTPException(status_code=413, detail="Voice file too large (max 10 MB)")
    if upload_size == 0:
        raise HTTPException(status_code=400, detail="Empty voice file")

    # Voice encoding is CPU-bound; run it on the inference pool so the
    # event loop stays responsive.
    try:
        return await asyncio.get_running_loop().run_in_executor(
            tts_executor, wrapper.encode_voice_from_bytes, audio_bytes
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"Voice encoding failed: {e}")
        raise HTTPException(
            status_code=400,
            detail=f"Could not process voice file: {str(e)}. "
            f"Supported formats: WAV, MP3, MPEG, M4A, OGG, FLAC, WebM."
        )
140
+
141
+
142
+ # ═══════════════════════════════════════════════════════════════════
143
+ # Helper: encode numpy audio to bytes in given format
144
+ # ═══════════════════════════════════════════════════════════════════
145
+
146
def _encode_audio(audio: np.ndarray, fmt: str = "wav") -> tuple[bytes, str]:
    """Serialise audio to (bytes, media-type); unknown formats fall back to WAV."""
    # format-name -> (soundfile format, HTTP media type)
    codecs = {
        "mp3": ("mp3", "audio/mpeg"),
        "flac": ("flac", "audio/flac"),
    }
    sf_format, media = codecs.get(fmt.lower(), ("wav", "audio/wav"))
    buf = io.BytesIO()
    sf.write(buf, audio, Config.SAMPLE_RATE, format=sf_format)
    return buf.getvalue(), media
159
+
160
+
161
def _encode_mp3_chunk(audio: np.ndarray) -> bytes:
    """Encode one numpy chunk to MP3 bytes (same encoder path as current server)."""
    mp3_bytes, _media = _encode_audio(audio, fmt="mp3")
    return mp3_bytes
165
+
166
+
167
+ def _build_helper_endpoint(base_url: str, path: str) -> str:
168
+ return f"{base_url.rstrip('/')}{path}"
169
+
170
+
171
def _internal_headers() -> dict[str, str]:
    """Headers for primary->helper internal calls, with shared secret when configured."""
    headers = {
        "Content-Type": "application/json",
        "Accept": "audio/mpeg",
    }
    secret = Config.INTERNAL_SHARED_SECRET
    if secret:
        headers["X-Internal-Secret"] = secret
    return headers
176
+
177
+
178
def _helper_request_chunk(
    helper_base_url: str,
    payload: dict,
    timeout_sec: float,
) -> bytes:
    """POST one chunk-synthesis request to the helper; return raw audio bytes."""
    request = urllib.request.Request(
        url=_build_helper_endpoint(helper_base_url, "/internal/chunk/synthesize"),
        data=json.dumps(payload).encode("utf-8"),
        headers=_internal_headers(),
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout_sec) as resp:
        return resp.read()
193
+
194
+
195
def _helper_register_voice(
    helper_base_url: str,
    stream_id: str,
    audio_bytes: bytes,
    timeout_sec: float,
) -> str:
    """Register reference voice on helper once, return voice_key for chunk calls."""
    query = urllib.parse.urlencode({"stream_id": stream_id})
    endpoint = _build_helper_endpoint(helper_base_url, f"/internal/voice/register?{query}")

    # Raw audio body, so Content-Type differs from the JSON internal headers.
    headers = {"Content-Type": "application/octet-stream", "Accept": "application/json"}
    if Config.INTERNAL_SHARED_SECRET:
        headers["X-Internal-Secret"] = Config.INTERNAL_SHARED_SECRET

    request = urllib.request.Request(
        url=endpoint,
        data=audio_bytes,
        headers=headers,
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout_sec) as resp:
        reply = json.loads(resp.read().decode("utf-8"))

    voice_key = (reply.get("voice_key") or "").strip()
    if not voice_key:
        raise RuntimeError("helper voice registration returned no voice_key")
    return voice_key
220
+
221
+
222
def _helper_cancel_stream(helper_base_url: str, stream_id: str):
    """Best-effort cancellation signal to helper."""
    try:
        request = urllib.request.Request(
            url=_build_helper_endpoint(helper_base_url, f"/internal/chunk/cancel/{stream_id}"),
            data=b"",
            headers=_internal_headers(),
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=3.0):
            pass
    except Exception:
        # Advisory only: the helper also expires work on its own timeouts.
        pass
236
+
237
+
238
+ # ═══════════════════════════════════════════════════════════════════
239
+ # Endpoints
240
+ # ═══════════════════════════════════════════════════════════════════
241
+
242
@app.get("/health")
async def health(warm_up: bool = False):
    """Liveness probe; pass ?warm_up=true to also run a warmup inference."""
    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
    model_ready = wrapper is not None
    status = {
        "status": "healthy" if model_ready else "loading",
        "model_loaded": model_ready,
        "model_dtype": Config.MODEL_DTYPE,
        "streaming_supported": True,
        "voice_cache_entries": wrapper._voice_cache.size if model_ready else 0,
    }
    if warm_up and model_ready:
        # Warmup is CPU-bound; push it onto the inference pool.
        try:
            await asyncio.get_running_loop().run_in_executor(tts_executor, wrapper.warmup)
            status["warm_up"] = "success"
        except Exception as e:
            status["warm_up"] = f"failed: {e}"
    return status
260
+
261
+
262
@app.get("/info")
async def info():
    """Static capability/configuration report for API clients."""
    parameter_info = {
        "max_new_tokens": {"default": Config.MAX_NEW_TOKENS, "range": "64–2048"},
        "repetition_penalty": {"default": Config.REPETITION_PENALTY, "range": "1.0–2.0"},
    }
    cloning_info = {
        "description": "Upload 3–30s reference WAV/MP3 as 'voice_ref' field",
        "max_upload_mb": Config.MAX_VOICE_UPLOAD_BYTES // (1024 * 1024),
    }
    parallel_info = {
        "enabled": Config.ENABLE_PARALLEL_MODE,
        "helper_configured": bool(Config.HELPER_BASE_URL),
        "helper_base_url": Config.HELPER_BASE_URL or None,
        "supports_voice_ref": True,
    }
    return {
        "model": Config.MODEL_ID,
        "dtype": Config.MODEL_DTYPE,
        "sample_rate": Config.SAMPLE_RATE,
        "paralinguistic_tags": list(Config.PARALINGUISTIC_TAGS),
        "tag_usage": "Insert tags directly in text, e.g. 'That is so funny! [laugh] Anyway…'",
        "parameters": parameter_info,
        "voice_cloning": cloning_info,
        "parallel_mode": parallel_info,
    }
285
+
286
+
287
+ # ── POST /tts ─────────────────────────────────────────────────────
288
+
289
@app.post("/tts", response_class=Response)
async def text_to_speech(
    text: str = Form(...),
    voice_ref: Optional[UploadFile] = File(None),
    output_format: str = Form("wav"),
    max_new_tokens: int = Form(Config.MAX_NEW_TOKENS),
    repetition_penalty: float = Form(Config.REPETITION_PENALTY),
):
    """Generate complete audio for the given text."""
    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
    if not wrapper:
        raise HTTPException(503, "Model not loaded")
    if not text or not text.strip():
        raise HTTPException(400, "Text is required")

    voice = await _resolve_voice(voice_ref, wrapper)

    # Inference is CPU-bound; keep it off the event loop.
    try:
        audio = await asyncio.get_running_loop().run_in_executor(
            tts_executor,
            wrapper.generate_speech,
            text, voice, max_new_tokens, repetition_penalty,
        )
    except ValueError as e:
        raise HTTPException(400, str(e))
    except Exception as e:
        logger.error(f"TTS error: {e}")
        raise HTTPException(500, "Internal server error")

    data, media_type = _encode_audio(audio, output_format)
    disposition = f"attachment; filename=tts_output.{output_format}"
    return Response(
        content=data,
        media_type=media_type,
        headers={"Content-Disposition": disposition},
    )
326
+
327
+ # ═══════════════════════════════════════════════════════════════════
328
+ # Active Stream Registry (for cancellation)
329
+ # ═══════════════════════════════════════════════════════════════════
330
+
331
# stream_id β†’ cancel Event for streams served by THIS process (/tts/stop uses it).
_active_streams: dict[str, threading.Event] = {}
# Stream ids cancelled via /internal/chunk/cancel (helper-side bookkeeping).
_internal_cancelled_streams: set[str] = set()
# Protects _internal_cancelled_streams and _internal_stream_voice_keys.
_internal_cancel_lock = threading.Lock()
# stream_id β†’ voice keys registered for it via /internal/voice/register.
_internal_stream_voice_keys: dict[str, set[str]] = {}
335
+
336
+
337
+ # ═══════════════════════════════════════════════════════════════════
338
+ # Pipeline Streaming Generator
339
+ # ═══════════════════════════════════════════════════════════════════
340
+
341
def _pipeline_stream_generator(
    wrapper: ChatterboxWrapper,
    text: str,
    voice: VoiceProfile,
    max_new_tokens: int,
    repetition_penalty: float,
    stream_id: str,
) -> Generator[bytes, None, None]:
    """Two-stage producer-consumer pipeline for minimal inter-chunk gaps.

    Architecture:
      Producer thread (heavyweight, ~80% CPU):
        ONNX token generation β†’ audio decoding β†’ raw numpy arrays β†’ queue

      Consumer (this generator, lightweight, ~20% CPU):
        queue β†’ MP3 encode β†’ yield to HTTP response

    Why this helps:
      - ONNX model runs CONTINUOUSLY without waiting for MP3 encode or HTTP
      - MP3 encoding (libsndfile, C code) releases GIL β†’ true parallelism
      - ONNX inference (C++ code) also releases GIL β†’ both run simultaneously
      - Queue(maxsize=2) lets producer stay 1-2 chunks ahead

    Cancellation:
      - cancel_event checked between chunks + every 25 autoregressive steps
      - Client disconnect triggers GeneratorExit β†’ finally sets cancel
      - /tts/stop endpoint sets cancel externally
    """
    cancel_event = threading.Event()
    _active_streams[stream_id] = cancel_event

    # Raw audio buffer: producer puts numpy arrays, consumer takes them
    audio_buffer: stdlib_queue.Queue = stdlib_queue.Queue(maxsize=2)

    def _producer():
        """Heavyweight worker: runs ONNX model continuously."""
        try:
            for audio_chunk in wrapper.stream_speech(
                text, voice,
                max_new_tokens=max_new_tokens,
                repetition_penalty=repetition_penalty,
                is_cancelled=cancel_event.is_set,
            ):
                if cancel_event.is_set():
                    break
                # Bounded put: retry with short timeout so a cancel can
                # interrupt a producer blocked on a full queue.
                while not cancel_event.is_set():
                    try:
                        audio_buffer.put(audio_chunk, timeout=0.1)
                        break
                    except stdlib_queue.Full:
                        continue
        except GenerationCancelled:
            logger.info(f"[{stream_id}] Generation cancelled")
        except Exception as e:
            # Hand the exception object to the consumer so it can log and stop.
            while not cancel_event.is_set():
                try:
                    audio_buffer.put(e, timeout=0.1)
                    break
                except stdlib_queue.Full:
                    continue
        finally:
            # None is the end-of-stream sentinel for the consumer.
            while not cancel_event.is_set():
                try:
                    audio_buffer.put(None, timeout=0.1)
                    break
                except stdlib_queue.Full:
                    continue

    producer = threading.Thread(target=_producer, daemon=True)
    producer.start()

    try:
        # Consumer: lightweight MP3 encoding + yield
        # NOTE(review): audio_buffer.get() has no timeout; if cancel is set
        # externally while the consumer is blocked here, the producer's finally
        # loop skips the sentinel and this get() could wait indefinitely —
        # consider get(timeout=...) with a cancel check. Confirm before changing.
        while True:
            item = audio_buffer.get()
            if item is None:
                break
            if isinstance(item, Exception):
                logger.error(f"[{stream_id}] Stream error: {item}")
                break
            if cancel_event.is_set():
                break

            # MP3 encode (C code, releases GIL, runs parallel with next ONNX step)
            buf = io.BytesIO()
            sf.write(buf, item, Config.SAMPLE_RATE, format="mp3")
            yield buf.getvalue()
    finally:
        # Cleanup: signal producer to stop + deregister
        cancel_event.set()
        _active_streams.pop(stream_id, None)
432
+
433
+
434
def _parallel_odd_even_stream_generator(
    wrapper: ChatterboxWrapper,
    text: str,
    local_voice: VoiceProfile,
    helper_voice_bytes: Optional[bytes],
    max_new_tokens: int,
    repetition_penalty: float,
    stream_id: str,
    helper_base_url: str,
) -> Generator[bytes, None, None]:
    """Additive odd/even split streamer (primary handles odd, helper handles even).

    Even-indexed chunks are delegated to a remote helper instance over HTTP;
    any helper failure downgrades to local synthesis for the remaining even
    chunks. Chunks are yielded strictly in order via a condition variable.
    """
    cancel_event = threading.Event()
    _active_streams[stream_id] = cancel_event

    clean_text = text_processor.sanitize(text.strip()[: Config.MAX_TEXT_LENGTH])
    chunks = text_processor.split_for_streaming(clean_text)
    total_chunks = len(chunks)
    if total_chunks == 0:
        _active_streams.pop(stream_id, None)
        return

    # Shared state between the two workers and this (consumer) generator;
    # all of it is guarded by `cond`.
    lock = threading.Lock()
    cond = threading.Condition(lock)
    ready: dict[int, bytes] = {}          # finished chunk_index β†’ MP3 bytes
    first_error: Optional[Exception] = None
    workers_done = 0                      # how many of the 2 workers exited

    def _publish(idx: int, data: bytes):
        # Hand a finished chunk to the consumer.
        with cond:
            ready[idx] = data
            cond.notify_all()

    def _set_error(err: Exception):
        # Record only the first worker error; later ones are dropped.
        nonlocal first_error
        with cond:
            if first_error is None:
                first_error = err
            cond.notify_all()

    def _worker_done():
        nonlocal workers_done
        with cond:
            workers_done += 1
            cond.notify_all()

    def _synth_local(chunk_text: str) -> bytes:
        # Synthesize one chunk on this process and MP3-encode it.
        audio = wrapper.generate_speech(
            chunk_text,
            local_voice,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
        )
        return _encode_mp3_chunk(audio)

    def _odd_worker():
        # Indices 0, 2, 4, … are always synthesized locally.
        try:
            for idx in range(0, total_chunks, 2):
                if cancel_event.is_set():
                    break
                data = _synth_local(chunks[idx])
                _publish(idx, data)
        except Exception as e:
            _set_error(e)
        finally:
            _worker_done()

    def _even_worker():
        # Indices 1, 3, 5, … go to the helper while it stays healthy.
        helper_available = True
        helper_voice_key: Optional[str] = None
        try:
            if helper_voice_bytes:
                # Register the reference voice on the helper once; retries
                # are controlled by HELPER_RETRY_ONCE.
                attempts = 2 if Config.HELPER_RETRY_ONCE else 1
                last_err: Optional[Exception] = None
                for _ in range(attempts):
                    try:
                        helper_voice_key = _helper_register_voice(
                            helper_base_url=helper_base_url,
                            stream_id=stream_id,
                            audio_bytes=helper_voice_bytes,
                            timeout_sec=max(1.0, Config.HELPER_TIMEOUT_SEC),
                        )
                        last_err = None
                        break
                    except Exception as reg_err:
                        last_err = reg_err
                        continue
                if last_err is not None:
                    helper_available = False
                    logger.warning(
                        f"[{stream_id}] Helper voice registration failed; "
                        "falling back to local synthesis for even chunks"
                    )

            for idx in range(1, total_chunks, 2):
                if cancel_event.is_set():
                    break

                if helper_available:
                    payload = {
                        "stream_id": stream_id,
                        "chunk_index": idx,
                        "text": chunks[idx],
                        "max_new_tokens": max_new_tokens,
                        "repetition_penalty": repetition_penalty,
                        "output_format": "mp3",
                    }
                    if helper_voice_key:
                        payload["voice_key"] = helper_voice_key

                    attempts = 2 if Config.HELPER_RETRY_ONCE else 1
                    last_err: Optional[Exception] = None
                    for _ in range(attempts):
                        try:
                            helper_data = _helper_request_chunk(
                                helper_base_url=helper_base_url,
                                payload=payload,
                                timeout_sec=max(1.0, Config.HELPER_TIMEOUT_SEC),
                            )
                            _publish(idx, helper_data)
                            last_err = None
                            break
                        except Exception as helper_err:
                            last_err = helper_err
                            continue

                    if last_err is None:
                        continue

                    # Helper failed for this chunk: disable it permanently for
                    # this stream and fall through to local synthesis below.
                    helper_available = False
                    logger.warning(
                        f"[{stream_id}] Helper failed at chunk {idx}; "
                        "falling back to local synthesis for remaining even chunks"
                    )

                # Local fallback for even chunks
                data = _synth_local(chunks[idx])
                _publish(idx, data)
        except Exception as e:
            _set_error(e)
        finally:
            _worker_done()

    odd_thread = threading.Thread(target=_odd_worker, daemon=True)
    even_thread = threading.Thread(target=_even_worker, daemon=True)
    odd_thread.start()
    even_thread.start()

    # Consumer: emit chunks in index order, waiting on the condition variable.
    next_idx = 0
    try:
        while next_idx < total_chunks:
            with cond:
                while (
                    next_idx not in ready
                    and first_error is None
                    and not cancel_event.is_set()
                    and workers_done < 2
                ):
                    cond.wait(timeout=0.1)

                if cancel_event.is_set():
                    break

                if next_idx in ready:
                    data = ready.pop(next_idx)
                elif first_error is not None:
                    logger.error(f"[{stream_id}] Parallel stream error: {first_error}")
                    break
                elif workers_done >= 2:
                    # Both workers exited without producing this index (e.g.
                    # cancelled mid-chunk): stop rather than wait forever.
                    logger.error(
                        f"[{stream_id}] Parallel stream ended with missing chunk index {next_idx}"
                    )
                    break
                else:
                    continue

            # Yield outside the lock so slow HTTP writes don't block workers.
            yield data
            next_idx += 1
    finally:
        cancel_event.set()
        _helper_cancel_stream(helper_base_url, stream_id)
        odd_thread.join(timeout=1.0)
        even_thread.join(timeout=1.0)
        _active_streams.pop(stream_id, None)
617
+
618
+
619
+ # ── POST /tts/stream & /tts/true-stream ──────────────────────────
620
+
621
@app.post("/tts/stream")
@app.post("/tts/true-stream")
async def stream_text_to_speech(
    text: str = Form(...),
    voice_ref: Optional[UploadFile] = File(None),
    max_new_tokens: int = Form(Config.MAX_NEW_TOKENS),
    repetition_penalty: float = Form(Config.REPETITION_PENALTY),
):
    """True real-time streaming: yields MP3 chunks as each sentence finishes.

    Response includes X-Stream-Id header for cancellation via /tts/stop.
    Compatible with frontend's MediaSource + ReadableStream pattern.
    """
    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
    if not wrapper:
        raise HTTPException(503, "Model not loaded")
    if not text or not text.strip():
        raise HTTPException(400, "Text is required")

    voice = await _resolve_voice(voice_ref, wrapper)
    stream_id = uuid.uuid4().hex[:12]

    chunk_generator = _pipeline_stream_generator(
        wrapper, text, voice, max_new_tokens, repetition_penalty, stream_id,
    )
    response_headers = {
        "Content-Disposition": "attachment; filename=tts_stream.mp3",
        "Transfer-Encoding": "chunked",
        "X-Stream-Id": stream_id,
        "X-Streaming-Type": "true-realtime",
        "Cache-Control": "no-cache",
    }
    return StreamingResponse(
        chunk_generator,
        media_type="audio/mpeg",
        headers=response_headers,
    )
657
+
658
+
659
@app.post("/tts/parallel-stream")
async def parallel_stream_text_to_speech(
    text: str = Form(...),
    voice_ref: Optional[UploadFile] = File(None),
    max_new_tokens: int = Form(Config.MAX_NEW_TOKENS),
    repetition_penalty: float = Form(Config.REPETITION_PENALTY),
    helper_url: Optional[str] = Form(None),
):
    """Additive odd/even split stream mode (primary + helper).

    Splits the text into sentence chunks: odd indices are synthesized locally,
    even indices by the helper instance. Optional `voice_ref` is encoded
    locally AND forwarded (as raw bytes) to the helper. `helper_url` overrides
    the configured CB_HELPER_BASE_URL for this request.
    """
    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
    if not wrapper:
        raise HTTPException(503, "Model not loaded")
    if not Config.ENABLE_PARALLEL_MODE:
        raise HTTPException(503, "Parallel mode is disabled")
    if not text or not text.strip():
        raise HTTPException(400, "Text is required")

    local_voice: VoiceProfile = wrapper.default_voice
    helper_voice_bytes: Optional[bytes] = None
    if voice_ref is not None and voice_ref.filename:
        helper_voice_bytes = await voice_ref.read()
        if len(helper_voice_bytes) > Config.MAX_VOICE_UPLOAD_BYTES:
            raise HTTPException(status_code=413, detail="Voice file too large (max 10 MB)")
        if len(helper_voice_bytes) == 0:
            raise HTTPException(status_code=400, detail="Empty voice file")
        loop = asyncio.get_running_loop()
        try:
            # CPU-bound voice encoding runs on the inference pool.
            local_voice = await loop.run_in_executor(
                tts_executor, wrapper.encode_voice_from_bytes, helper_voice_bytes
            )
        except Exception as e:
            logger.error(f"Parallel voice encoding failed: {e}")
            raise HTTPException(400, "Could not process voice file for parallel mode")

    # FIX: guard with `or ""` — if neither helper_url nor CB_HELPER_BASE_URL
    # is set, Config.HELPER_BASE_URL may be None and .strip() would raise
    # AttributeError (a 500) instead of the intended 400 below.
    resolved_helper = (helper_url or Config.HELPER_BASE_URL or "").strip()
    if not resolved_helper:
        raise HTTPException(
            400,
            "Helper URL not configured. Set CB_HELPER_BASE_URL or pass helper_url.",
        )

    stream_id = uuid.uuid4().hex[:12]
    return StreamingResponse(
        _parallel_odd_even_stream_generator(
            wrapper=wrapper,
            text=text,
            local_voice=local_voice,
            helper_voice_bytes=helper_voice_bytes,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
            stream_id=stream_id,
            helper_base_url=resolved_helper,
        ),
        media_type="audio/mpeg",
        headers={
            "Content-Disposition": "attachment; filename=tts_parallel_stream.mp3",
            "Transfer-Encoding": "chunked",
            "X-Stream-Id": stream_id,
            "X-Streaming-Type": "parallel-odd-even",
            "Cache-Control": "no-cache",
        },
    )
721
+
722
+
723
+ # ── JSON body variant (Kokoro/OpenAI compatibility) ───────────────
724
+
725
+ from pydantic import BaseModel, Field
726
+
727
+
728
class InternalChunkRequest(BaseModel):
    """Payload for /internal/chunk/synthesize (primary β†’ helper chunk routing)."""
    # Stream this chunk belongs to (primary-generated 12-hex id).
    stream_id: str = Field(..., min_length=1, max_length=64)
    # Position of the chunk in the full text; echoed back in X-Chunk-Index.
    chunk_index: int = Field(..., ge=0)
    # Sentence-sized text to synthesize.
    text: str = Field(..., min_length=1, max_length=10000)
    max_new_tokens: int = Field(default=Config.MAX_NEW_TOKENS, ge=64, le=2048)
    repetition_penalty: float = Field(default=Config.REPETITION_PENALTY, ge=1.0, le=2.0)
    # "mp3" | "wav" | "flac"; anything else is coerced to mp3 by the endpoint.
    output_format: str = Field(default="mp3")
    # Key returned by /internal/voice/register; omit to use the default voice.
    voice_key: Optional[str] = Field(default=None, min_length=1, max_length=64)
736
+
737
+
738
class TTSJsonRequest(BaseModel):
    """JSON body for /v1/audio/speech (OpenAI-compatible; default voice only)."""
    text: str = Field(..., min_length=1, max_length=50000)
    # Accepted for API compatibility; only "default" has an effect here.
    voice: str = Field(default="default")
    speed: float = Field(default=1.0, ge=0.5, le=2.0)  # reserved for future use
    max_new_tokens: int = Field(default=Config.MAX_NEW_TOKENS, ge=64, le=2048)
    repetition_penalty: float = Field(default=Config.REPETITION_PENALTY, ge=1.0, le=2.0)
744
+
745
+
746
@app.post("/internal/voice/register")
async def internal_voice_register(http_request: Request):
    """Register voice once for a stream; returns reusable voice_key."""
    secret = Config.INTERNAL_SHARED_SECRET
    if secret and http_request.headers.get("X-Internal-Secret", "") != secret:
        raise HTTPException(403, "Forbidden")

    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
    if not wrapper:
        raise HTTPException(503, "Model not loaded")

    audio_bytes = await http_request.body()
    if len(audio_bytes) > Config.MAX_VOICE_UPLOAD_BYTES:
        raise HTTPException(status_code=413, detail="Voice file too large (max 10 MB)")
    if len(audio_bytes) == 0:
        raise HTTPException(status_code=400, detail="Empty voice file")

    # Encoding is CPU-bound; run on the inference pool.
    try:
        voice = await asyncio.get_running_loop().run_in_executor(
            tts_executor, wrapper.encode_voice_from_bytes, audio_bytes
        )
    except Exception as e:
        logger.error(f"[internal] voice register failed: {e}")
        raise HTTPException(400, "Voice registration failed")

    voice_key = (voice.audio_hash or "").strip()
    if not voice_key:
        raise HTTPException(500, "Voice key unavailable")

    # Track which keys belong to the stream so cancellation can drop them.
    stream_id = (http_request.query_params.get("stream_id") or "").strip()
    if stream_id:
        with _internal_cancel_lock:
            _internal_stream_voice_keys.setdefault(stream_id, set()).add(voice_key)

    return {"status": "registered", "voice_key": voice_key}
784
+
785
+
786
@app.post("/internal/chunk/synthesize")
async def internal_chunk_synthesize(
    request: InternalChunkRequest,
    http_request: Request,
):
    """Internal endpoint used by primary/helper parallel routing."""
    secret = Config.INTERNAL_SHARED_SECRET
    if secret and http_request.headers.get("X-Internal-Secret", "") != secret:
        raise HTTPException(403, "Forbidden")

    with _internal_cancel_lock:
        if request.stream_id in _internal_cancelled_streams:
            raise HTTPException(409, "Stream already cancelled")

    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
    if not wrapper:
        raise HTTPException(503, "Model not loaded")

    # Default voice unless the caller registered one and passed its key.
    voice_profile = wrapper.default_voice
    if request.voice_key:
        cached_voice = wrapper._voice_cache.get(request.voice_key)
        if cached_voice is None:
            raise HTTPException(409, "Voice key expired or not found")
        voice_profile = cached_voice

    try:
        audio = await asyncio.get_running_loop().run_in_executor(
            tts_executor,
            wrapper.generate_speech,
            request.text,
            voice_profile,
            request.max_new_tokens,
            request.repetition_penalty,
        )
    except Exception as e:
        logger.error(f"[internal] chunk {request.chunk_index} failed: {e}")
        raise HTTPException(500, "Chunk synthesis failed")

    # Unknown formats are coerced to mp3 rather than rejected.
    fmt = (request.output_format or "mp3").lower()
    if fmt not in {"mp3", "wav", "flac"}:
        fmt = "mp3"
    data, media_type = _encode_audio(audio, fmt=fmt)
    return Response(
        content=data,
        media_type=media_type,
        headers={
            "X-Stream-Id": request.stream_id,
            "X-Chunk-Index": str(request.chunk_index),
        },
    )
838
+
839
+
840
@app.post("/internal/chunk/cancel/{stream_id}")
async def internal_chunk_cancel(stream_id: str, http_request: Request):
    """Mark a parallel stream cancelled and drop its registered voice keys."""
    secret = Config.INTERNAL_SHARED_SECRET
    if secret and http_request.headers.get("X-Internal-Secret", "") != secret:
        raise HTTPException(403, "Forbidden")

    with _internal_cancel_lock:
        _internal_cancelled_streams.add(stream_id)
        _internal_stream_voice_keys.pop(stream_id, None)
    return {"status": "cancelled", "stream_id": stream_id}
851
+
852
+
853
@app.post("/v1/audio/speech")
async def openai_compatible_tts(request: TTSJsonRequest):
    """OpenAI-compatible streaming endpoint (JSON body, no file upload).

    Uses the default voice. For voice cloning, use /tts/stream with FormData.
    """
    wrapper: ChatterboxWrapper = getattr(app.state, "wrapper", None)
    if not wrapper:
        raise HTTPException(503, "Model not loaded")

    stream_id = uuid.uuid4().hex[:12]
    chunk_generator = _pipeline_stream_generator(
        wrapper, request.text, wrapper.default_voice,
        request.max_new_tokens, request.repetition_penalty, stream_id,
    )
    return StreamingResponse(
        chunk_generator,
        media_type="audio/mpeg",
        headers={
            "Transfer-Encoding": "chunked",
            "X-Stream-Id": stream_id,
            "Cache-Control": "no-cache",
        },
    )
877
+
878
+
879
+ # ═══════════════════════════════════════════════════════════════════
880
+ # Stop / Cancel Endpoint
881
+ # ═══════════════════════════════════════════════════════════════════
882
+
883
@app.post("/tts/stop/{stream_id}")
async def stop_stream(stream_id: str):
    """Stop an active TTS stream by its ID (from X-Stream-Id header).

    Cancels the ONNX generation loop mid-token, freeing CPU immediately.
    """
    event = _active_streams.get(stream_id)
    if event is None:
        return {"status": "not_found", "stream_id": stream_id}
    event.set()
    logger.info(f"Stream {stream_id} cancelled by client")
    return {"status": "stopped", "stream_id": stream_id}
895
+
896
+
897
@app.post("/tts/stop")
async def stop_all_streams():
    """Emergency stop: cancel ALL active TTS streams."""
    # Snapshot first: generators also mutate _active_streams as they wind down.
    snapshot = list(_active_streams.items())
    for _sid, cancel_event in snapshot:
        cancel_event.set()
    _active_streams.clear()
    count = len(snapshot)
    logger.info(f"Stopped all streams ({count} active)")
    return {"status": "stopped_all", "count": count}
906
+
907
+
908
+ # ═══════════════════════════════════════════════════════════════════
909
+ # Entrypoint
910
+ # ═══════════════════════════════════════════════════════════════════
911
+
912
# Local development entrypoint; production uses the Dockerfile's uvicorn CMD.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host=Config.HOST, port=Config.PORT)
chatterbox_wrapper.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chatterbox Turbo TTS β€” ONNX Inference Wrapper
3
+ ═══════════════════════════════════════════════
4
+ Orchestrates the 4-component ONNX pipeline:
5
+ embed_tokens β†’ speech_encoder β†’ language_model β†’ conditional_decoder
6
+
7
+ Optimised for lowest-latency CPU inference on 2 vCPU:
8
+ β€’ Sequential execution, thread count = physical cores, no spinning
9
+ β€’ Token list pre-allocation (avoids O(nΒ²) np.concatenate in loop)
10
+ β€’ In-memory voice caching (no disk writes for uploads)
11
+ β€’ Robust audio loading: WAV, MP3, MPEG, M4A, OGG, FLAC, WebM
12
+ β€’ Sentence-level streaming for real-time playback
13
+ """
14
+
15
+ # ── Suppress harmless transformers warnings BEFORE import ─────────
16
+ import os
17
+ import warnings
18
+
19
+ os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
20
+ warnings.filterwarnings("ignore", message=".*model of type.*chatterbox.*")
21
+
22
+ import hashlib
23
+ import io
24
+ import logging
25
+ import subprocess
26
+ import tempfile
27
+ import time
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+ from typing import Callable, Generator, Optional
31
+
32
+ import librosa
33
+ import numpy as np
34
+ import onnxruntime as ort
35
+ import soundfile as soundfile_lib
36
+ from huggingface_hub import hf_hub_download
37
+ from transformers import AutoTokenizer
38
+
39
+ from config import Config
40
+ import text_processor
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+ # ── Supported audio MIME types for voice upload ───────────────────
45
+ _SUPPORTED_AUDIO_EXTENSIONS = {
46
+ ".wav", ".mp3", ".mpeg", ".mpga", ".m4a", ".mp4",
47
+ ".ogg", ".oga", ".opus", ".flac", ".webm", ".aac", ".wma",
48
+ }
49
+
50
+
51
+ # ═══════════════════════════════════════════════════════════════════
52
+ # Data Structures
53
+ # ═══════════════════════════════════════════════════════════════════
54
+
55
@dataclass
class VoiceProfile:
    """Cached speaker embedding extracted from reference audio.

    All four arrays are the raw outputs of one speech_encoder run and are
    fed back into later pipeline stages (see _generate_tokens and
    _decode_tokens).
    """
    cond_emb: np.ndarray            # speaker conditioning embeddings, prepended to text embeds at LM step 0
    prompt_token: np.ndarray        # speech-token prompt, prepended before waveform decoding
    speaker_embeddings: np.ndarray  # passed to the conditional decoder as "speaker_embeddings"
    speaker_features: np.ndarray    # passed to the conditional decoder as "speaker_features"
    audio_hash: str = ""            # md5 of the source audio bytes; "__default__" for the bundled voice
63
+
64
+
65
class GenerationCancelled(Exception):
    """Signals that the client aborted an in-flight inference."""
68
+
69
+
70
+ # ═══════════════════════════════════════════════════════════════════
71
+ # LRU Voice Cache
72
+ # ═══════════════════════════════════════════════════════════════════
73
+
74
class _VoiceCache:
    """LRU cache of VoiceProfile objects with per-entry TTL expiry.

    An entry older than `ttl_seconds` (default: 1 hour) is dropped on the
    next access. Re-uploading the same voice file inside the TTL window
    returns the cached profile instantly — no re-encoding needed.
    """

    def __init__(self, maxsize: int, ttl_seconds: int = 3600):
        # key -> (profile, insertion timestamp); OrderedDict gives LRU order
        self._cache: OrderedDict[str, tuple[VoiceProfile, float]] = OrderedDict()
        self._maxsize = maxsize
        self._ttl = ttl_seconds

    def _evict_expired(self):
        """Drop every entry whose age exceeds the TTL."""
        cutoff = time.time() - self._ttl
        stale = [k for k, (_, stamp) in self._cache.items() if stamp < cutoff]
        for k in stale:
            del self._cache[k]
            logger.debug(f"Voice cache expired: {k[:8]}…")

    def get(self, key: str) -> Optional[VoiceProfile]:
        """Return the cached profile for *key* (refreshing its LRU rank), or None."""
        self._evict_expired()
        entry = self._cache.get(key)
        if entry is None:
            return None
        profile, stamp = entry
        remaining = self._ttl - (time.time() - stamp)
        self._cache.move_to_end(key)
        logger.info(f"Voice cache HIT: {key[:8]}… (expires in {remaining:.0f}s)")
        return profile

    def put(self, key: str, profile: VoiceProfile):
        """Store *profile* under *key*, evicting the LRU entry when full."""
        self._evict_expired()
        if key in self._cache:
            self._cache.move_to_end(key)
        elif len(self._cache) >= self._maxsize:
            evicted_key, _ = self._cache.popitem(last=False)
            logger.debug(f"Voice cache evicted (LRU): {evicted_key[:8]}…")
        self._cache[key] = (profile, time.time())
        logger.info(f"Voice cache STORED: {key[:8]}… (TTL: {self._ttl}s, size: {len(self._cache)})")

    @property
    def size(self) -> int:
        """Current number of cached entries."""
        return len(self._cache)
119
+
120
+
121
+ # ═══════════════════════════════════════════════════════════════════
122
+ # Audio Loading (robust multi-format support)
123
+ # ════════════════���══════════════════════════════════════════════════
124
+
125
def _load_audio_bytes(audio_bytes: bytes, sr: int = 24000) -> np.ndarray:
    """Load audio from raw bytes, supporting WAV/MP3/MPEG/M4A/OGG/FLAC/WebM.

    Strategy: try soundfile (fast, native) β†’ librosa (ffmpeg backend) β†’ ffmpeg CLI.
    Each decoder is tried in order; the first success wins.

    Args:
        audio_bytes: Raw encoded audio file contents.
        sr: Target sample rate; decoded audio is resampled to this rate.

    Returns:
        Mono float32 waveform at `sr` Hz.

    Raises:
        ValueError: If none of the three decoders can read the bytes.
    """
    buf = io.BytesIO(audio_bytes)

    # 1) Try soundfile (handles WAV, FLAC, OGG natively β€” fastest)
    try:
        audio, file_sr = soundfile_lib.read(buf)
        if audio.ndim > 1:
            audio = audio.mean(axis=1)  # stereo β†’ mono
        if file_sr != sr:
            audio = librosa.resample(audio.astype(np.float32), orig_sr=file_sr, target_sr=sr)
        return audio.astype(np.float32)
    except Exception:
        buf.seek(0)  # rewind so the next decoder sees the full stream

    # 2) Try librosa (handles MP3 via audioread + ffmpeg backend)
    try:
        audio, _ = librosa.load(buf, sr=sr, mono=True)
        return audio.astype(np.float32)
    except Exception:
        buf.seek(0)

    # 3) Fallback: use ffmpeg CLI to convert to WAV in memory.
    # "-ac 1 -ar <sr>" makes ffmpeg itself do the mono downmix + resample,
    # so the decoded output needs no further processing here.
    try:
        proc = subprocess.run(
            ["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ac", "1", "-ar", str(sr), "pipe:1"],
            input=audio_bytes, capture_output=True, timeout=30,
        )
        # 44 bytes is a bare RIFF/WAV header; > 44 means actual samples exist.
        if proc.returncode == 0 and len(proc.stdout) > 44:
            wav_buf = io.BytesIO(proc.stdout)
            audio, _ = soundfile_lib.read(wav_buf)
            return audio.astype(np.float32)
    except Exception:
        pass

    raise ValueError(
        "Could not decode audio file. Supported formats: "
        "WAV, MP3, MPEG, M4A, OGG, FLAC, WebM, AAC. "
        "Please upload a valid audio file."
    )
168
+
169
+
170
+ # ═══════════════════════════════════════════════════════════════════
171
+ # Main Wrapper
172
+ # ═══════════════════════════════════════════════════════════════════
173
+
174
class ChatterboxWrapper:
    """End-to-end ONNX TTS pipeline:
    embed_tokens β†’ speech_encoder β†’ language_model β†’ conditional_decoder.

    Sessions are created once at startup and reused for every request;
    with Config.MAX_WORKERS = 1 a single inference owns all CPU cores.
    """

    def __init__(self, download_only: bool = False):
        """Download models and (unless download_only) build the runtime.

        Args:
            download_only: Fetch model files and return early β€” used at
                Docker build time to bake the HF cache into the image.
        """
        self.cfg = Config
        os.makedirs(self.cfg.MODELS_DIR, exist_ok=True)

        logger.info(f"Downloading ONNX models (dtype={self.cfg.MODEL_DTYPE}) …")
        self._model_paths = self._download_models()

        if download_only:
            return

        logger.info(
            f"Creating ONNX Runtime sessions "
            f"(intra_op_threads={self.cfg.CPU_THREADS}, workers={self.cfg.MAX_WORKERS}) …"
        )
        opts = self._make_session_options()
        providers = ["CPUExecutionProvider"]

        self.embed_session = ort.InferenceSession(self._model_paths["embed_tokens"], sess_options=opts, providers=providers)
        self.encoder_session = ort.InferenceSession(self._model_paths["speech_encoder"], sess_options=opts, providers=providers)
        self.lm_session = ort.InferenceSession(self._model_paths["language_model"], sess_options=opts, providers=providers)
        self.decoder_session = ort.InferenceSession(self._model_paths["conditional_decoder"], sess_options=opts, providers=providers)

        logger.info("Loading tokenizer …")
        self.tokenizer = AutoTokenizer.from_pretrained(self.cfg.MODEL_ID)

        self._voice_cache = _VoiceCache(
            maxsize=self.cfg.VOICE_CACHE_SIZE,
            ttl_seconds=self.cfg.VOICE_CACHE_TTL_SEC,
        )

        logger.info("Encoding default reference voice …")
        self.default_voice = self._load_default_voice()

        logger.info("βœ… ChatterboxWrapper ready")

    # ─── Model download ──────────────────────────────────────────

    def _download_models(self) -> dict:
        """Download all 4 ONNX components + weight files from HuggingFace."""
        components = ("conditional_decoder", "speech_encoder", "embed_tokens", "language_model")
        paths = {}
        for name in components:
            paths[name] = self._download_component(name, self.cfg.MODEL_DTYPE)
        return paths

    def _download_component(self, name: str, dtype: str) -> str:
        """Download one ONNX graph (and its external-weights companion, if any).

        Returns the local path of the graph file.
        """
        if dtype == "fp32":
            filename = f"{name}.onnx"
        elif dtype == "q8":
            filename = f"{name}_quantized.onnx"
        else:
            filename = f"{name}_{dtype}.onnx"

        graph = hf_hub_download(
            self.cfg.MODEL_ID, subfolder="onnx", filename=filename,
            cache_dir=self.cfg.MODELS_DIR,
        )
        # Download companion external-weights file (ONNX external data is
        # stored next to the graph as "<graph filename>_data").
        # BUG FIX: the companion filename must be derived from the graph
        # filename; previously a literal placeholder string was requested,
        # so external weights were never fetched.
        try:
            hf_hub_download(
                self.cfg.MODEL_ID, subfolder="onnx", filename=f"{filename}_data",
                cache_dir=self.cfg.MODELS_DIR,
            )
        except Exception:
            pass  # Some quantized variants embed weights in-graph
        return graph

    # ─── Session configuration (optimised for 2 vCPU) ─────────────

    def _make_session_options(self) -> ort.SessionOptions:
        """Build ORT session options tuned for a small fixed-core CPU box."""
        opts = ort.SessionOptions()
        # Sequential execution: no parallel graph scheduling overhead
        opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        # Match physical cores exactly (2 for HF Space free tier)
        opts.intra_op_num_threads = self.cfg.CPU_THREADS
        opts.inter_op_num_threads = 1
        # Full graph optimisations (constant folding, fusion, etc.)
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # Disable thread spinning β€” wastes CPU cycles on busy-wait
        opts.add_session_config_entry("session.intra_op.allow_spinning", "0")
        opts.add_session_config_entry("session.inter_op.allow_spinning", "0")
        # Enable memory optimisations
        opts.enable_cpu_mem_arena = True
        opts.enable_mem_pattern = True
        opts.enable_mem_reuse = True
        return opts

    # ─── Default voice ────────────────────────────────────────────

    def _load_default_voice(self) -> VoiceProfile:
        """Fetch and encode the bundled reference voice."""
        path = hf_hub_download(
            self.cfg.DEFAULT_VOICE_REPO,
            filename=self.cfg.DEFAULT_VOICE_FILE,
            cache_dir=self.cfg.MODELS_DIR,
        )
        audio, _ = librosa.load(path, sr=self.cfg.SAMPLE_RATE)
        return self._encode_audio_array(audio, audio_hash="__default__")

    # ─── Voice encoding ──────────────────────────────────────────

    def encode_voice_from_bytes(self, audio_bytes: bytes) -> VoiceProfile:
        """Encode reference audio from raw bytes (in-memory, no disk write).

        Accepts: WAV, MP3, MPEG, M4A, OGG, FLAC, WebM, AAC, WMA, Opus.

        Raises:
            ValueError: If the audio cannot be decoded or is shorter than
                MIN_REF_DURATION_SEC. Audio longer than MAX_REF_DURATION_SEC
                is silently truncated.
        """
        audio_hash = hashlib.md5(audio_bytes).hexdigest()
        cached = self._voice_cache.get(audio_hash)
        if cached is not None:
            logger.info(f"Voice cache hit: {audio_hash[:8]}…")
            return cached

        # Robust multi-format audio loading
        audio = _load_audio_bytes(audio_bytes, sr=self.cfg.SAMPLE_RATE)

        # Validate duration
        duration = len(audio) / self.cfg.SAMPLE_RATE
        if duration < self.cfg.MIN_REF_DURATION_SEC:
            raise ValueError(
                f"Reference audio too short ({duration:.1f}s). "
                f"Minimum: {self.cfg.MIN_REF_DURATION_SEC}s"
            )
        if duration > self.cfg.MAX_REF_DURATION_SEC:
            audio = audio[: int(self.cfg.MAX_REF_DURATION_SEC * self.cfg.SAMPLE_RATE)]

        profile = self._encode_audio_array(audio, audio_hash=audio_hash)
        self._voice_cache.put(audio_hash, profile)
        return profile

    def _encode_audio_array(self, audio: np.ndarray, audio_hash: str = "") -> VoiceProfile:
        """Run speech_encoder on a float32 mono audio array."""
        audio_input = audio[np.newaxis, :].astype(np.float32)
        cond_emb, prompt_token, speaker_emb, speaker_feat = self.encoder_session.run(
            None, {"audio_values": audio_input}
        )
        return VoiceProfile(
            cond_emb=cond_emb,
            prompt_token=prompt_token,
            speaker_embeddings=speaker_emb,
            speaker_features=speaker_feat,
            audio_hash=audio_hash,
        )

    # ─── Full generation (non-streaming) ──────────────────────────

    def generate_speech(
        self,
        text: str,
        voice: Optional[VoiceProfile] = None,
        max_new_tokens: Optional[int] = None,
        repetition_penalty: Optional[float] = None,
    ) -> np.ndarray:
        """Generate complete audio for the given text.

        Returns:
            Mono float32 waveform at Config.SAMPLE_RATE.

        Raises:
            ValueError: If the text is empty after sanitization.
        """
        voice = voice or self.default_voice
        text = text_processor.sanitize(text.strip()[: self.cfg.MAX_TEXT_LENGTH])
        if not text:
            raise ValueError("Text is empty after sanitization")

        tokens = self._generate_tokens(
            text, voice,
            max_new_tokens or self.cfg.MAX_NEW_TOKENS,
            repetition_penalty or self.cfg.REPETITION_PENALTY,
        )
        return self._decode_tokens(tokens, voice)

    # ─── Streaming generation ─────────────────────────────────────

    def stream_speech(
        self,
        text: str,
        voice: Optional[VoiceProfile] = None,
        max_new_tokens: Optional[int] = None,
        repetition_penalty: Optional[float] = None,
        is_cancelled: Optional[Callable[[], bool]] = None,
    ) -> Generator[np.ndarray, None, None]:
        """Yield audio chunks sentence-by-sentence for real-time streaming.

        Each sentence is independently processed through the full pipeline
        so the first chunk arrives as fast as possible (low TTFB).

        Args:
            is_cancelled: Optional callable that returns True to abort generation.
                Checked between chunks and every 25 autoregressive steps.
        """
        voice = voice or self.default_voice
        text = text_processor.sanitize(text.strip()[: self.cfg.MAX_TEXT_LENGTH])
        if not text:
            return

        sentences = text_processor.split_for_streaming(text)
        _max = max_new_tokens or self.cfg.MAX_NEW_TOKENS
        _rep = repetition_penalty or self.cfg.REPETITION_PENALTY
        _check = is_cancelled or (lambda: False)

        for i, sentence in enumerate(sentences):
            # Check cancellation between chunks
            if _check():
                logger.info("Generation cancelled by client (between chunks)")
                return
            if not sentence.strip():
                continue
            t0 = time.perf_counter()
            try:
                tokens = self._generate_tokens(sentence, voice, _max, _rep, _check)
                if _check():
                    return
                audio = self._decode_tokens(tokens, voice)
                elapsed = time.perf_counter() - t0
                audio_duration = len(audio) / self.cfg.SAMPLE_RATE
                rtf = elapsed / audio_duration if audio_duration > 0 else 0
                logger.info(
                    f"Chunk {i + 1}/{len(sentences)}: "
                    f"{len(sentence)} chars β†’ {audio_duration:.1f}s audio "
                    f"in {elapsed:.2f}s (RTF: {rtf:.2f}x)"
                )
                yield audio
            except GenerationCancelled:
                logger.info(f"Generation cancelled mid-token at chunk {i + 1}")
                return
            except Exception as e:
                logger.error(f"Error on chunk {i + 1}: {e}")
                raise

    # ─── Autoregressive token generation (OPTIMISED) ──────────────

    def _generate_tokens(
        self,
        text: str,
        voice: VoiceProfile,
        max_new_tokens: int,
        repetition_penalty: float,
        is_cancelled: Callable[[], bool] = lambda: False,
    ) -> np.ndarray:
        """Run embed β†’ LM autoregressive loop. Returns raw token array.

        Optimisations:
          β€’ Token list instead of repeated np.concatenate (O(n) β†’ O(1) append)
          β€’ Unique tokens set for inline repetition penalty (avoids exponential penalty bug)
          β€’ Pre-allocated attention mask for zero-copy slicing
          β€’ Correct dimensional routing for step 0 prompt processing

        Raises:
            GenerationCancelled: If is_cancelled() returns True mid-loop
                (checked every 25 steps).
        """
        input_ids = self.tokenizer(text, return_tensors="np")["input_ids"].astype(np.int64)

        # Pre-allocate collections
        token_list: list[int] = [self.cfg.START_SPEECH_TOKEN]
        unique_tokens: set[int] = {self.cfg.START_SPEECH_TOKEN}
        penalty = repetition_penalty

        past_key_values = None
        attention_mask_full = None
        seq_len = 0

        for step in range(max_new_tokens):
            if step > 0 and step % 25 == 0 and is_cancelled():
                raise GenerationCancelled()

            embeds = self.embed_session.run(None, {"input_ids": input_ids})[0]

            if step == 0:
                # Prepend speaker conditioning
                embeds = np.concatenate((voice.cond_emb, embeds), axis=1)
                batch, seq_len, _ = embeds.shape

                # Empty KV cache, dtype matched per-input to the LM graph
                past_key_values = {
                    inp.name: np.zeros(
                        [batch, self.cfg.NUM_KV_HEADS, 0, self.cfg.HEAD_DIM],
                        dtype=np.float16 if inp.type == "tensor(float16)" else np.float32,
                    )
                    for inp in self.lm_session.get_inputs()
                    if "past_key_values" in inp.name
                }

                # Pre-allocate full attention mask
                attention_mask_full = np.ones((batch, seq_len + max_new_tokens), dtype=np.int64)
                attention_mask = attention_mask_full[:, :seq_len]

                # Step 0 requires position_ids matching prompt sequence length
                position_ids = np.arange(seq_len, dtype=np.int64).reshape(batch, -1)
            else:
                # O(1) zero-copy slice for subsequent steps
                attention_mask = attention_mask_full[:, : seq_len + step]
                # Single position ID for the single new token
                position_ids = np.array([[seq_len + step - 1]], dtype=np.int64)

            # Language model forward pass
            logits, *present_kv = self.lm_session.run(
                None,
                dict(
                    inputs_embeds=embeds,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    **past_key_values,
                ),
            )

            # ── Inline repetition penalty + token selection ───────
            last_logits = logits[0, -1, :].copy()  # shape: (vocab_size,)

            # Apply repetition penalty strictly to unique tokens to prevent over-penalization
            for tok_id in unique_tokens:
                if last_logits[tok_id] < 0:
                    last_logits[tok_id] *= penalty
                else:
                    last_logits[tok_id] /= penalty

            next_token = int(np.argmax(last_logits))
            token_list.append(next_token)
            unique_tokens.add(next_token)

            if next_token == self.cfg.STOP_SPEECH_TOKEN:
                break

            # Update state for next step
            input_ids = np.array([[next_token]], dtype=np.int64)
            for j, key in enumerate(past_key_values):
                past_key_values[key] = present_kv[j]

        return np.array([token_list], dtype=np.int64)

    # ─── Token β†’ audio decoding ───────────────────────────────────

    def _decode_tokens(self, generated: np.ndarray, voice: VoiceProfile) -> np.ndarray:
        """Decode speech tokens to a float32 waveform at 24 kHz."""
        # Strip START token; strip STOP token if present
        tokens = generated[:, 1:]
        if tokens.shape[1] > 0 and tokens[0, -1] == self.cfg.STOP_SPEECH_TOKEN:
            tokens = tokens[:, :-1]

        if tokens.shape[1] == 0:
            return np.zeros(0, dtype=np.float32)

        # Prepend prompt token + append silence
        silence = np.full(
            (tokens.shape[0], 3), self.cfg.SILENCE_TOKEN, dtype=np.int64
        )
        full_tokens = np.concatenate(
            [voice.prompt_token, tokens, silence], axis=1
        )

        wav = self.decoder_session.run(
            None,
            {
                "speech_tokens": full_tokens,
                "speaker_embeddings": voice.speaker_embeddings,
                "speaker_features": voice.speaker_features,
            },
        )[0].squeeze(axis=0)

        return wav

    # ─── Warmup ───────────────────────────────────────────────────

    def warmup(self):
        """Run a short inference to warm up ONNX sessions and JIT paths."""
        try:
            t0 = time.perf_counter()
            _ = self.generate_speech("Hello.", self.default_voice, max_new_tokens=32)
            logger.info(f"Warmup done in {time.perf_counter() - t0:.2f}s")
        except Exception as e:
            logger.warning(f"Warmup failed (non-critical): {e}")
config.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chatterbox Turbo TTS β€” Centralized Configuration
3
+ ═══════════════════════════════════════════════════
4
+ Optimised for HF Space free tier (2 vCPU).
5
+ Adjust MODEL_DTYPE to switch quantization (q8/q4/fp16/fp32).
6
+ All settings overridable via environment variables prefixed CB_.
7
+ """
8
+ import os
9
+
10
+ _HERE = os.path.dirname(os.path.abspath(__file__))
11
+
12
+
13
+ def _get_bool(name: str, default: bool) -> bool:
14
+ raw = os.getenv(name)
15
+ if raw is None:
16
+ return default
17
+ return raw.strip().lower() in {"1", "true", "yes", "on"}
18
+
19
+
20
class Config:
    # ── Model ────────────────────────────────────────────────────
    MODEL_ID: str = os.getenv("CB_MODEL_ID", "ResembleAI/chatterbox-turbo-ONNX")

    # fp32  β†’ highest quality, ~1.4 GB, slowest
    # fp16  β†’ good quality, ~0.7 GB
    # q8    β†’ β˜… recommended, ~0.35 GB, best balance
    # q4    β†’ smallest, ~0.17 GB, fastest, slight loss
    # q4f16 β†’ q4 weights + fp16 activations
    MODEL_DTYPE: str = os.getenv("CB_MODEL_DTYPE", "q4")

    MODELS_DIR: str = os.getenv("CB_MODELS_DIR", os.path.join(_HERE, "models"))

    # ── ONNX Runtime CPU tuning (optimised for 2 vCPU) ───────────
    #
    # KEY RULE: intra_op threads MUST match physical cores.
    #   β†’ 4 threads on 2 cores = oversubscription = SLOWER.
    #   β†’ 2 threads on 2 cores = each op uses both cores perfectly.
    #
    # MAX_WORKERS = 1 ensures ONE inference gets both cores.
    #   β†’ 2 workers would split 2 cores = both requests slow.
    #
    CPU_THREADS: int = int(os.getenv("CB_CPU_THREADS", "2"))
    MAX_WORKERS: int = int(os.getenv("CB_MAX_WORKERS", "1"))

    # ── Generation defaults ──────────────────────────────────────
    SAMPLE_RATE: int = 24000
    MAX_NEW_TOKENS: int = int(os.getenv("CB_MAX_NEW_TOKENS", "768"))
    REPETITION_PENALTY: float = float(os.getenv("CB_REPETITION_PENALTY", "1.2"))
    MAX_TEXT_LENGTH: int = int(os.getenv("CB_MAX_TEXT_LENGTH", "50000"))

    # ── Model constants (official card β€” do not change) ──────────
    START_SPEECH_TOKEN: int = 6561
    STOP_SPEECH_TOKEN: int = 6562
    SILENCE_TOKEN: int = 4299
    NUM_KV_HEADS: int = 16
    HEAD_DIM: int = 64

    # ── Paralinguistic tags (Turbo native) ───────────────────────
    # Rendered by the model when written inline as e.g. "[laugh]".
    PARALINGUISTIC_TAGS: tuple = (
        "laugh", "chuckle", "cough", "sigh", "gasp",
        "shush", "groan", "sniff", "clear throat",
    )

    # ── Voice / reference audio ──────────────────────────────────
    # NOTE: Official ResembleAI/chatterbox-turbo-ONNX has no bundled voice.
    # The default_voice.wav is a plain audio sample from community repo
    # (not a model β€” just a reference WAV, safe to use from any source).
    DEFAULT_VOICE_REPO: str = "onnx-community/chatterbox-ONNX"
    DEFAULT_VOICE_FILE: str = "default_voice.wav"
    MAX_VOICE_UPLOAD_BYTES: int = 10 * 1024 * 1024  # 10 MB
    MIN_REF_DURATION_SEC: float = 1.5
    MAX_REF_DURATION_SEC: float = 30.0
    VOICE_CACHE_SIZE: int = int(os.getenv("CB_VOICE_CACHE_SIZE", "20"))
    VOICE_CACHE_TTL_SEC: int = int(os.getenv("CB_VOICE_CACHE_TTL", "3600"))  # 1 hour

    # ── Streaming ────────────────────────────────────────────────
    # Smaller chunks = faster TTFB (first audio arrives sooner).
    # Default 100 chars β‰ˆ one short sentence β‰ˆ fastest first-chunk on 2 vCPU.
    MAX_CHUNK_CHARS: int = int(os.getenv("CB_MAX_CHUNK_CHARS", "100"))
    # Additive parallel mode (odd/even split across primary/helper).
    ENABLE_PARALLEL_MODE: bool = _get_bool("CB_ENABLE_PARALLEL_MODE", True)
    HELPER_BASE_URL: str = os.getenv("CB_HELPER_BASE_URL", "https://shadowhunter222-hello2.hf.space").strip()
    HELPER_TIMEOUT_SEC: float = float(os.getenv("CB_HELPER_TIMEOUT_SEC", "45"))
    HELPER_RETRY_ONCE: bool = _get_bool("CB_HELPER_RETRY_ONCE", True)
    # Optional shared secret for internal chunk endpoints.
    INTERNAL_SHARED_SECRET: str = os.getenv("CB_INTERNAL_SHARED_SECRET", "").strip()

    # ── Server ───────────────────────────────────────────────────
    HOST: str = os.getenv("CB_HOST", "0.0.0.0")
    PORT: int = int(os.getenv("CB_PORT", "7860"))

    # CORS allow-list: production domain plus common local dev ports.
    ALLOWED_ORIGINS: list[str] = [
        "https://toolboxesai.com",
        "http://localhost:8788", "http://127.0.0.1:8788",
        "http://localhost:5502", "http://127.0.0.1:5502",
        "http://localhost:5501", "http://127.0.0.1:5501",
        "http://localhost:5500", "http://127.0.0.1:5500",
        "http://localhost:5173", "http://127.0.0.1:5173",
        "http://localhost:7860", "http://127.0.0.1:7860",
    ]
requirements.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # Chatterbox Turbo TTS - Dependencies (CPU-only)
3
+ # =========================================================
4
+
5
+ # PyTorch CPU (required by transformers tokenizer internals)
6
+ torch --index-url https://download.pytorch.org/whl/cpu
7
+
8
+ # Core API
9
+ fastapi>=0.104.1
10
+ uvicorn[standard]>=0.24.0
11
+ pydantic>=2.5.0
12
+ python-multipart>=0.0.6
13
+
14
+ # ONNX Runtime (CPU inference)
15
+ onnxruntime>=1.17.0
16
+
17
+ # Audio processing
18
+ numpy>=1.24.0
19
+ librosa>=0.10.0
20
+ soundfile>=0.12.0
21
+
22
+ # Tokenizer + model download
23
+ transformers>=4.46.0
24
+ huggingface-hub>=0.19.0
text_processor.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Chatterbox Turbo TTS β€” Text Processor
═══════════════════════════════════════
Sanitizes raw input text and splits it into sentence-level chunks
for streaming TTS. Paralinguistic tags ([laugh], [cough], …) are
explicitly preserved so the model can render them.
"""
import re
from typing import List

from config import Config

# ═══════════════════════════════════════════════════════════════════
# Pre-compiled regex patterns (compiled once at import β†’ zero cost)
# ═══════════════════════════════════════════════════════════════════

# β€” Paralinguistic tag protector (matches [laugh], [clear throat], etc.)
# Tag names come from Config.PARALINGUISTIC_TAGS; each is re.escape()d so
# the alternation stays safe even if a tag contains regex metacharacters.
_TAG_NAMES = "|".join(re.escape(t) for t in Config.PARALINGUISTIC_TAGS)
_RE_PARA_TAG = re.compile(rf"\[(?:{_TAG_NAMES})\]", re.IGNORECASE)

# β€” Markdown / structural noise
# Capture groups keep the speakable text (link label, bold/italic body)
# while the surrounding markup is stripped by sanitize().
_RE_CODE_BLOCK = re.compile(r"```[\s\S]*?```")
_RE_INLINE_CODE = re.compile(r"`([^`]+)`")
_RE_IMAGE = re.compile(r"!\[([^\]]*)\]\([^)]+\)")
_RE_LINK = re.compile(r"\[([^\]]+)\]\([^)]+\)")
_RE_BOLD_AST = re.compile(r"\*\*(.+?)\*\*")
_RE_BOLD_UND = re.compile(r"__(.+?)__")
_RE_STRIKE = re.compile(r"~~(.+?)~~")
_RE_ITALIC_AST = re.compile(r"\*(.+?)\*")
# Word-boundary lookarounds so snake_case identifiers are not treated
# as underscore-italics.
_RE_ITALIC_UND = re.compile(r"(?<!\w)_(.+?)_(?!\w)")
_RE_HEADER = re.compile(r"^#{1,6}\s+", re.MULTILINE)
_RE_BLOCKQUOTE = re.compile(r"^>+\s?", re.MULTILINE)
_RE_HR = re.compile(r"^[-*_]{3,}$", re.MULTILINE)  # horizontal rules
_RE_BULLET = re.compile(r"^\s*[-*+]\s+", re.MULTILINE)
_RE_ORDERED = re.compile(r"^\s*\d+\.\s+", re.MULTILINE)

# β€” URLs, emojis, HTML entities
_RE_URL = re.compile(r"https?://\S+")
# Covers the common emoji/pictograph blocks plus variation selectors and
# the ZWJ used in compound emoji sequences.
_RE_EMOJI = re.compile(
    r"["
    r"\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
    r"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
    r"\U00002702-\U000027B0\U0001F900-\U0001F9FF"
    r"\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF"
    r"\U00002600-\U000026FF\U0000FE00-\U0000FE0F"
    r"\U0000200D"
    r"]+", re.UNICODE,
)
# Matches both named (&amp;) and numeric (&#x27;) entities; only the
# names in _HTML_ENTITIES get spoken replacements, the rest are dropped.
_RE_HTML_ENTITY = re.compile(r"&(?:#x?[\da-fA-F]+|\w+);")
_HTML_ENTITIES = {
    "&amp;": " and ", "&lt;": " less than ", "&gt;": " greater than ",
    "&nbsp;": " ", "&quot;": '"', "&apos;": "'",
    "&mdash;": ", ", "&ndash;": ", ", "&hellip;": ".",
}

# β€” Punctuation normalization (runs of repeats collapse to one mark)
_RE_REPEATED_DOT = re.compile(r"\.{2,}")
_RE_REPEATED_EXCLAM = re.compile(r"!{2,}")
_RE_REPEATED_QUEST = re.compile(r"\?{2,}")
_RE_REPEATED_SEMI = re.compile(r";{2,}")
_RE_REPEATED_COLON = re.compile(r":{2,}")
_RE_REPEATED_COMMA = re.compile(r",{2,}")
_RE_REPEATED_DASH = re.compile(r"-{3,}")

# β€” Whitespace
_RE_MULTI_SPACE = re.compile(r"[ \t]+")
_RE_MULTI_NEWLINE = re.compile(r"\n{3,}")
_RE_SPACE_BEFORE_PUN = re.compile(r"\s+([.!?,;:])")

# β€” Sentence boundary (split point)
# Lookbehind keeps the terminating punctuation attached to its sentence.
_RE_SENTENCE_SPLIT = re.compile(r"(?<=[.!?;:])\s+")

# Chunks of ≀ this many words get merged with a neighbour by
# split_for_streaming() to avoid tiny TTS segments.
_MIN_MERGE_WORDS = 5


# ═══════════════════════════════════════════════════════════════════
# Public API
# ═══════════════════════════════════════════════════════════════════

80
def sanitize(text: str) -> str:
    """Clean raw input for TTS while preserving paralinguistic tags.

    Pipeline (order matters):
      1. Replace paralinguistic tags with placeholders so later passes
         cannot mangle them.
      2. Strip URLs, markdown structure, and code.
      3. Remove emojis; keep hashtag words without the '#'.
      4. Translate known HTML entities, drop unknown ones.
      5. Collapse repeated punctuation to a single mark.
      6. Normalize whitespace.
      7. Restore the protected tags verbatim.
    """
    if not text:
        return text

    # 1. Shield paralinguistic tags behind numbered placeholders.
    stashed: list[str] = []

    def _stash(match):
        token = f"Β§TAG{len(stashed)}Β§"
        stashed.append(match.group(0))
        return token

    text = _RE_PARA_TAG.sub(_stash, text)

    # 2. Drop non-speakable structures (applied in this exact order:
    #    URLs and code blocks first, then inline markdown, then
    #    line-level structure).
    for pattern, repl in (
        (_RE_URL, ""),
        (_RE_CODE_BLOCK, ""),
        (_RE_IMAGE, lambda m: m.group(1) or ""),  # keep alt text if any
        (_RE_LINK, r"\1"),
        (_RE_BOLD_AST, r"\1"),
        (_RE_BOLD_UND, r"\1"),
        (_RE_STRIKE, r"\1"),
        (_RE_ITALIC_AST, r"\1"),
        (_RE_ITALIC_UND, r"\1"),
        (_RE_INLINE_CODE, r"\1"),
        (_RE_HEADER, ""),
        (_RE_BLOCKQUOTE, ""),
        (_RE_HR, ""),
        (_RE_BULLET, ""),
        (_RE_ORDERED, ""),
    ):
        text = pattern.sub(repl, text)

    # 3. Emojis vanish; hashtags keep their word.
    text = _RE_EMOJI.sub("", text)
    text = re.sub(r"#(\w+)", r"\1", text)

    # 4. HTML entities: known names get a spoken equivalent, everything
    #    else (including numeric entities) is removed.
    text = _RE_HTML_ENTITY.sub(
        lambda m: _HTML_ENTITIES.get(m.group(0), ""), text
    )

    # 5. Collapse runs of repeated punctuation.
    for pattern, repl in (
        (_RE_REPEATED_DOT, "."),
        (_RE_REPEATED_EXCLAM, "!"),
        (_RE_REPEATED_QUEST, "?"),
        (_RE_REPEATED_SEMI, ";"),
        (_RE_REPEATED_COLON, ":"),
        (_RE_REPEATED_COMMA, ","),
        (_RE_REPEATED_DASH, "β€”"),
    ):
        text = pattern.sub(repl, text)

    # 6. Whitespace normalization.
    text = _RE_SPACE_BEFORE_PUN.sub(r"\1", text)
    text = _RE_MULTI_SPACE.sub(" ", text)
    text = _RE_MULTI_NEWLINE.sub("\n\n", text)
    text = text.strip()

    # 7. Put the original tags back, byte-for-byte.
    for idx, original in enumerate(stashed):
        text = text.replace(f"Β§TAG{idx}Β§", original)

    return text
139
def split_for_streaming(text: str, max_chars: int = Config.MAX_CHUNK_CHARS) -> List[str]:
    """Split sanitized text into sentence-level chunks for streaming.

    Strategy:
      1. Split on sentence-ending punctuation boundaries.
      2. Cap each chunk at ``max_chars`` (long sentences are broken on
         commas or word boundaries).
      3. Merge short chunks (≀5 words) into a neighbour so the model
         never receives a tiny fragment.
    """
    if not text:
        return []

    # Step 1: sentence-boundary split, discarding empty fragments.
    sentences = [s.strip() for s in _RE_SENTENCE_SPLIT.split(text) if s.strip()]

    # Step 2: enforce the per-chunk character budget.
    sized: List[str] = []
    for sentence in sentences:
        if len(sentence) > max_chars:
            sized.extend(_break_long_chunk(sentence, max_chars))
        else:
            sized.append(sentence)

    if len(sized) <= 1:
        return sized

    # Step 3: fold short chunks forward into their successor.
    merged: List[str] = []
    pending = ""
    last = len(sized) - 1
    for idx, piece in enumerate(sized):
        if pending:
            piece = pending + " " + piece
            pending = ""
        if idx != last and len(piece.split()) <= _MIN_MERGE_WORDS:
            pending = piece  # too short β€” defer and prepend to the next one
        else:
            merged.append(piece)
    if pending:
        # A short trailing chunk folds backward into the previous one.
        if merged:
            merged[-1] += " " + pending
        else:
            merged.append(pending)

    return merged
185
+ # ═══════════════════════════════════════════════════════════════════
186
+ # Internal helpers
187
+ # ═══════════════════════════════════════════════════════════════════
188
+
189
+ def _break_long_chunk(text: str, max_chars: int) -> List[str]:
190
+ """Break a chunk longer than max_chars on commas or word boundaries."""
191
+ parts: List[str] = []
192
+ remaining = text
193
+ while len(remaining) > max_chars:
194
+ # Try comma first
195
+ pos = remaining.rfind(",", 0, max_chars)
196
+ if pos == -1:
197
+ pos = remaining.rfind(" ", 0, max_chars)
198
+ if pos == -1:
199
+ pos = max_chars # hard break
200
+ segment = remaining[:pos].strip()
201
+ if segment:
202
+ parts.append(segment)
203
+ remaining = remaining[pos:].lstrip(", ")
204
+ if remaining.strip():
205
+ parts.append(remaining.strip())
206
+ return parts