Spaces:

Harshil748
/

VoiceAPI

Sleeping

App Files Files Community

Harshil748 committed 27 days ago

Commit

24d1041

1 Parent(s): b9ffa51

Add voice cloning service and update API for voice generation

Browse files

Files changed (4) hide show

.gitignore +1 -0
requirements.txt +1 -0
src/api.py +171 -51
src/elevenlabs_service.py +126 -0

.gitignore CHANGED Viewed

@@ -32,3 +32,4 @@ Thumbs.db
 # Test outputs
 *.wav
 *.mp3

 # Test outputs
 *.wav
 *.mp3
+.env.example

requirements.txt CHANGED Viewed

@@ -22,6 +22,7 @@ pydantic
 # Utilities
 tqdm
 requests
 # Development (optional)
 pytest

 # Utilities
 tqdm
 requests
+httpx
 # Development (optional)
 pytest

src/api.py CHANGED Viewed

@@ -11,7 +11,8 @@ import io
 import time
 import logging
 import tempfile
-from typing import Optional, List
 from pathlib import Path
 import numpy as np
@@ -23,6 +24,7 @@ from fastapi import (
     BackgroundTasks,
     UploadFile,
     File,
 )
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, FileResponse, JSONResponse
@@ -36,6 +38,7 @@ from .config import (
     get_available_voices,
     STYLE_PRESETS,
 )
 # Language mapping for XTTS voice cloning
 XTTS_LANG_MAP = {
@@ -90,8 +93,8 @@ app = FastAPI(
     """,
     version="1.0.0",
     contact={
-        "name": "Voice Tech for All Hackathon",
-        "url": "https://huggingface.co/SYSPIN",
     },
     license_info={
         "name": "CC BY 4.0",
@@ -110,6 +113,18 @@ app.add_middleware(
 # Initialize TTS Engine (lazy loading)
 _engine: Optional[TTSEngine] = None
 def get_engine() -> TTSEngine:
@@ -173,6 +188,32 @@ class CloneResponse(BaseModel):
     language: str
 class VoiceInfo(BaseModel):
     """Information about a voice"""
@@ -353,6 +394,85 @@ async def synthesize_stream(request: SynthesizeRequest):
         raise HTTPException(status_code=500, detail=str(e))
 @app.post("/clone", response_class=Response)
 async def clone_voice(
     text: str = Query(..., description="Text to synthesize with cloned voice"),
@@ -364,15 +484,19 @@ async def clone_voice(
     pitch: float = Query(1.0, description="Pitch", ge=0.5, le=2.0),
     energy: float = Query(1.0, description="Energy", ge=0.5, le=2.0),
     style: Optional[str] = Query(None, description="Style preset"),
     speaker_wav: UploadFile = File(
         ..., description="Reference speaker WAV (3-15 seconds recommended)"
     ),
 ):
     """
-    Clone a custom voice from uploaded sample using XTTS v2.
     """
-    engine = get_engine()
     lang_lower = lang.lower().strip()
     if lang_lower not in XTTS_LANG_MAP:
         supported = ", ".join(sorted(XTTS_LANG_MAP.keys()))
@@ -381,55 +505,51 @@ async def clone_voice(
             detail=f"Unsupported clone language: {lang}. Supported: {supported}",
         )
-    temp_path = None
-    try:
-        data = await speaker_wav.read()
-        if len(data) < 44:
-            raise HTTPException(status_code=400, detail="Invalid speaker_wav file")
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-            tmp.write(data)
-            temp_path = tmp.name
-        start_time = time.time()
-        output = engine.clone_voice(
-            text=text,
-            speaker_wav_path=temp_path,
-            language_code=XTTS_LANG_MAP[lang_lower],
-            speed=speed,
-            pitch=pitch,
-            energy=energy,
-            style=style,
-            normalize_text=True,
         )
-        inference_time = time.time() - start_time
-        buffer = io.BytesIO()
-        sf.write(buffer, output.audio, output.sample_rate, format="WAV")
-        buffer.seek(0)
-        return Response(
-            content=buffer.read(),
-            media_type="audio/wav",
-            headers={
-                "X-Duration": str(output.duration),
-                "X-Sample-Rate": str(output.sample_rate),
-                "X-Language": lang_lower,
-                "X-Voice": "custom_cloned",
-                "X-Inference-Time": str(inference_time),
-            },
         )
-    except HTTPException:
-        raise
-    except Exception as e:
-        logger.error(f"Clone error: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-    finally:
-        if temp_path and os.path.exists(temp_path):
-            try:
-                os.remove(temp_path)
-            except OSError:
-                pass
 @app.get("/synthesize/get")

 import time
 import logging
 import tempfile
+import uuid
+from typing import Optional, List, Dict
 from pathlib import Path
 import numpy as np
     BackgroundTasks,
     UploadFile,
     File,
+    Form,
 )
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, FileResponse, JSONResponse
     get_available_voices,
     STYLE_PRESETS,
 )
+from .elevenlabs_service import ElevenLabsService
 # Language mapping for XTTS voice cloning
 XTTS_LANG_MAP = {
     """,
     version="1.0.0",
     contact={
+        "name": "Harshil PAtel",
+        "url": "https://harshilpatel.me/#contact",
     },
     license_info={
         "name": "CC BY 4.0",
 # Initialize TTS Engine (lazy loading)
 _engine: Optional[TTSEngine] = None
+_elevenlabs = ElevenLabsService()
+# In-memory session voice cache for temporary cloned voice IDs
+_voice_session_cache: Dict[str, str] = {}
+ALLOWED_AUDIO_TYPES = {
+    "audio/wav",
+    "audio/x-wav",
+    "audio/mpeg",
+    "audio/mp3",
+}
+MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_BYTES", str(10 * 1024 * 1024)))
 def get_engine() -> TTSEngine:
     language: str
+def _validate_audio_upload(upload: UploadFile, raw_bytes: bytes) -> None:
+    """Reject a reference clip that is missing, mistyped, empty or too large.
+
+    Args:
+        upload: The multipart upload; only its filename and content type are
+            inspected here.
+        raw_bytes: The already-read file contents, used for the size checks.
+
+    Raises:
+        HTTPException: 400 for a missing upload, a filename not ending in
+            .wav/.mp3, a content type outside ALLOWED_AUDIO_TYPES, or an empty
+            body; 413 when the body is larger than MAX_UPLOAD_BYTES.
+    """
+    if upload is None:
+        raise HTTPException(status_code=400, detail="speaker_wav is required")
+    filename = (upload.filename or "").lower()
+    if not filename.endswith((".wav", ".mp3")):
+        raise HTTPException(
+            status_code=400, detail="Only .wav or .mp3 files are supported"
+        )
+    content_type = upload.content_type or ""
+    # Compare the bare media type only: clients may append parameters
+    # (e.g. "audio/wav; codecs=1") or vary the case, and both are still valid.
+    media_type = content_type.split(";", 1)[0].strip().lower()
+    if media_type and media_type not in ALLOWED_AUDIO_TYPES:
+        raise HTTPException(
+            status_code=400, detail=f"Unsupported content type: {content_type}"
+        )
+    if not raw_bytes:
+        raise HTTPException(status_code=400, detail="Uploaded audio file is empty")
+    if len(raw_bytes) > MAX_UPLOAD_BYTES:
+        # 413 Payload Too Large is the status defined for an over-limit body.
+        raise HTTPException(
+            status_code=413,
+            detail=f"Audio file too large. Max allowed is {MAX_UPLOAD_BYTES // (1024 * 1024)} MB",
+        )
 class VoiceInfo(BaseModel):
     """Information about a voice"""
         raise HTTPException(status_code=500, detail=str(e))
+@app.post("/generate", response_class=StreamingResponse)
+async def generate_with_cloned_voice(
+    text: str = Form(...),
+    lang: str = Form("english"),
+    session_id: Optional[str] = Form(None),
+    speaker_wav: Optional[UploadFile] = File(None),
+):
+    """
+    Temporary voice cloning + speech generation using ElevenLabs.
+
+    The first request of a session must upload ``speaker_wav``; the resulting
+    ElevenLabs voice id is kept in the in-process session cache and reused by
+    later requests that send the same ``session_id``. When
+    ``ELEVENLABS_FALLBACK_VOICE_ID`` is set it is used whenever cloning or
+    synthesis with the cloned voice fails.
+
+    Args:
+        text: Text to synthesize.
+        lang: Language hint forwarded to the speech service (lower-cased).
+        session_id: Key of a previously cloned voice; a new one is generated
+            when omitted and returned in the ``X-Session-Id`` header.
+        speaker_wav: Reference audio, required when the session has no cached
+            voice yet.
+
+    Returns:
+        An ``audio/mpeg`` streaming response. ``X-Voice-Id`` names the voice
+        that actually produced the audio.
+
+    Raises:
+        HTTPException: 400 for empty text or a missing/invalid upload; any
+            error raised by the ElevenLabs service when no fallback applies.
+    """
+    # Local import so this handler does not depend on the file-level imports.
+    from fastapi.concurrency import run_in_threadpool
+    # Fail fast: do not spend a voice clone on a request that cannot be spoken.
+    if not text or not text.strip():
+        raise HTTPException(status_code=400, detail="text is required")
+    # NOTE(review): lang is forwarded as typed (e.g. "english"); confirm the
+    # speech service accepts language names rather than ISO codes.
+    lang_lower = (lang or "english").lower().strip()
+    local_session_id = session_id or uuid.uuid4().hex
+    fallback_voice_id = os.getenv("ELEVENLABS_FALLBACK_VOICE_ID")
+    voice_id = _voice_session_cache.get(local_session_id)
+    if voice_id is None:
+        if speaker_wav is None:
+            raise HTTPException(
+                status_code=400,
+                detail="speaker_wav is required for first request in a session",
+            )
+        audio_bytes = await speaker_wav.read()
+        _validate_audio_upload(speaker_wav, audio_bytes)
+        # Rewind instead of building a second UploadFile: the service reads the
+        # same object again, and no constructor signature is relied upon.
+        await speaker_wav.seek(0)
+        try:
+            voice_id = await _elevenlabs.clone_voice(speaker_wav)
+            _voice_session_cache[local_session_id] = voice_id
+            logger.info(
+                "Cloned ElevenLabs voice for session=%s voice_id=%s",
+                local_session_id,
+                voice_id,
+            )
+        except HTTPException:
+            if not fallback_voice_id:
+                raise
+            logger.warning(
+                "Clone failed for session=%s, using fallback voice",
+                local_session_id,
+            )
+            # The fallback is deliberately not cached, so the next request of
+            # this session gets another chance to clone.
+            voice_id = fallback_voice_id
+    try:
+        # generate_speech is a blocking call; run it in the threadpool so the
+        # event loop keeps serving other requests meanwhile.
+        audio_bytes = await run_in_threadpool(
+            _elevenlabs.generate_speech,
+            text=text,
+            voice_id=voice_id,
+            language=lang_lower,
+        )
+    except HTTPException:
+        if not fallback_voice_id or voice_id == fallback_voice_id:
+            raise
+        logger.warning(
+            "TTS failed for voice=%s, retrying with fallback voice", voice_id
+        )
+        # Rebind so the X-Voice-Id header reports the voice that was used.
+        voice_id = fallback_voice_id
+        audio_bytes = await run_in_threadpool(
+            _elevenlabs.generate_speech,
+            text=text,
+            voice_id=voice_id,
+            language=lang_lower,
+        )
+    headers = {
+        "Content-Disposition": "attachment; filename=generated.mp3",
+        "X-Session-Id": local_session_id,
+        "X-Voice-Id": voice_id,
+        "X-Provider": "elevenlabs",
+    }
+    return StreamingResponse(
+        io.BytesIO(audio_bytes), media_type="audio/mpeg", headers=headers
+    )
 @app.post("/clone", response_class=Response)
 async def clone_voice(
     text: str = Query(..., description="Text to synthesize with cloned voice"),
     pitch: float = Query(1.0, description="Pitch", ge=0.5, le=2.0),
     energy: float = Query(1.0, description="Energy", ge=0.5, le=2.0),
     style: Optional[str] = Query(None, description="Style preset"),
+    session_id: Optional[str] = Query(
+        None, description="Session key to reuse cloned voice"
+    ),
     speaker_wav: UploadFile = File(
         ..., description="Reference speaker WAV (3-15 seconds recommended)"
     ),
 ):
     """
+    Backward-compatible clone endpoint using ElevenLabs voice cloning.
     """
     lang_lower = lang.lower().strip()
+    local_session_id = session_id or uuid.uuid4().hex
+    fallback_voice_id = os.getenv("ELEVENLABS_FALLBACK_VOICE_ID")
     if lang_lower not in XTTS_LANG_MAP:
         supported = ", ".join(sorted(XTTS_LANG_MAP.keys()))
             detail=f"Unsupported clone language: {lang}. Supported: {supported}",
         )
+    voice_id = _voice_session_cache.get(local_session_id)
+    if voice_id is None:
+        audio_bytes = await speaker_wav.read()
+        _validate_audio_upload(speaker_wav, audio_bytes)
+        clone_file = UploadFile(
+            filename=speaker_wav.filename,
+            file=io.BytesIO(audio_bytes),
+            headers=speaker_wav.headers,
         )
+        try:
+            voice_id = await _elevenlabs.clone_voice(clone_file)
+            _voice_session_cache[local_session_id] = voice_id
+        except HTTPException as exc:
+            if fallback_voice_id:
+                voice_id = fallback_voice_id
+            else:
+                raise exc
+    try:
+        audio_bytes = _elevenlabs.generate_speech(
+            text=text,
+            voice_id=voice_id,
+            language=lang_lower,
         )
+    except HTTPException as exc:
+        if fallback_voice_id and voice_id != fallback_voice_id:
+            audio_bytes = _elevenlabs.generate_speech(
+                text=text,
+                voice_id=fallback_voice_id,
+                language=lang_lower,
+            )
+        else:
+            raise exc
+    return Response(
+        content=audio_bytes,
+        media_type="audio/mpeg",
+        headers={
+            "Content-Disposition": "attachment; filename=cloned_output.mp3",
+            "X-Language": lang_lower,
+            "X-Voice": voice_id,
+            "X-Session-Id": local_session_id,
+            "X-Provider": "elevenlabs",
+        },
+    )
 @app.get("/synthesize/get")

src/elevenlabs_service.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import logging
+import os
+import uuid
+from typing import Optional
+import httpx
+from fastapi import HTTPException, UploadFile
+logger = logging.getLogger(__name__)
+class ElevenLabsService:
+    """Small client for the ElevenLabs voice-cloning and text-to-speech API.
+
+    Settings are read from the environment once, at construction time:
+    ``ELEVENLABS_API_KEY``, ``ELEVENLABS_BASE_URL``, ``ELEVENLABS_MODEL_ID``
+    and ``ELEVENLABS_TIMEOUT_SECONDS``. Every failure is raised as a
+    ``fastapi.HTTPException`` so endpoint code can let it propagate.
+    """
+    def __init__(self) -> None:
+        """Load API key, base URL, model id and timeout from the environment."""
+        self.api_key = os.getenv("ELEVENLABS_API_KEY")
+        self.base_url = os.getenv("ELEVENLABS_BASE_URL", "https://api.elevenlabs.io/v1")
+        self.model_id = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
+        self.timeout = float(os.getenv("ELEVENLABS_TIMEOUT_SECONDS", "40"))
+    def _headers(self) -> dict:
+        """Return the authentication header, or raise 500 if no key is set."""
+        if not self.api_key:
+            raise HTTPException(
+                status_code=500,
+                detail="ELEVENLABS_API_KEY is not configured",
+            )
+        return {"xi-api-key": self.api_key}
+    async def clone_voice(self, audio_file: UploadFile) -> str:
+        """Clone a voice in ElevenLabs and return the generated voice_id.
+
+        Args:
+            audio_file: Reference recording; it is read from its current
+                position, so the caller must rewind it if already consumed.
+
+        Returns:
+            The ``voice_id`` field of the ``/voices/add`` response.
+
+        Raises:
+            HTTPException: 400 for a missing or empty file, 500 when the API
+                key is not configured or an unexpected error occurs, 502 when
+                the upstream call fails or returns no voice id, 504 on timeout.
+        """
+        if audio_file is None:
+            raise HTTPException(status_code=400, detail="speaker_wav is required")
+        file_bytes = await audio_file.read()
+        if not file_bytes:
+            raise HTTPException(status_code=400, detail="speaker_wav is empty")
+        voice_name = f"voiceapi-temp-{uuid.uuid4().hex[:10]}"
+        files = {
+            "files": (
+                audio_file.filename or "sample.wav",
+                file_bytes,
+                audio_file.content_type or "audio/wav",
+            )
+        }
+        data = {
+            "name": voice_name,
+            "description": "Temporary cloned voice from VoiceAPI session",
+        }
+        try:
+            # Async client: a blocking httpx.Client inside this coroutine would
+            # freeze the event loop for the whole upload.
+            async with httpx.AsyncClient(timeout=self.timeout) as client:
+                response = await client.post(
+                    f"{self.base_url}/voices/add",
+                    headers=self._headers(),
+                    data=data,
+                    files=files,
+                )
+            if response.status_code >= 400:
+                logger.error("ElevenLabs clone failed: %s", response.text)
+                raise HTTPException(
+                    status_code=502,
+                    detail=f"Voice cloning failed: {response.text[:300]}",
+                )
+            payload = response.json()
+            voice_id = payload.get("voice_id")
+            if not voice_id:
+                raise HTTPException(
+                    status_code=502, detail="voice_id missing in clone response"
+                )
+            return voice_id
+        except httpx.TimeoutException as exc:
+            raise HTTPException(
+                status_code=504, detail="Voice cloning timed out"
+            ) from exc
+        except HTTPException:
+            raise
+        except Exception as exc:
+            logger.exception("Unexpected clone error")
+            raise HTTPException(
+                status_code=500, detail=f"Clone request failed: {exc}"
+            ) from exc
+    def generate_speech(
+        self,
+        text: str,
+        voice_id: str,
+        language: Optional[str] = None,
+        output_format: str = "mp3_44100_128",
+    ) -> bytes:
+        """Generate speech bytes using ElevenLabs text-to-speech API.
+
+        This method blocks on the HTTP request; call it from a worker thread
+        when inside an event loop.
+
+        Args:
+            text: Text to speak; must contain non-whitespace characters.
+            voice_id: ElevenLabs voice to synthesize with.
+            language: Optional value sent as ``language_code``.
+            output_format: Value of the ``output_format`` query parameter.
+
+        Returns:
+            The raw response body (requested as ``audio/mpeg``).
+
+        Raises:
+            HTTPException: 400 for empty text or voice id, 500 when the API key
+                is not configured or an unexpected error occurs, 502 when the
+                upstream call fails, 504 on timeout.
+        """
+        # Guard against None as well as blank text so callers get a 400
+        # instead of an AttributeError.
+        if not text or not text.strip():
+            raise HTTPException(status_code=400, detail="text is required")
+        if not voice_id:
+            raise HTTPException(status_code=400, detail="voice_id is required")
+        body = {
+            "text": text,
+            "model_id": self.model_id,
+            "voice_settings": {
+                "stability": 0.5,
+                "similarity_boost": 0.75,
+            },
+        }
+        if language:
+            # NOTE(review): presumably the API expects an ISO 639-1 code here
+            # and only some models accept it - confirm against the ElevenLabs
+            # text-to-speech reference before relying on this field.
+            body["language_code"] = language
+        try:
+            with httpx.Client(timeout=self.timeout) as client:
+                response = client.post(
+                    f"{self.base_url}/text-to-speech/{voice_id}",
+                    params={"output_format": output_format},
+                    headers={**self._headers(), "Accept": "audio/mpeg"},
+                    json=body,
+                )
+            if response.status_code >= 400:
+                logger.error("ElevenLabs TTS failed: %s", response.text)
+                raise HTTPException(
+                    status_code=502,
+                    detail=f"Speech generation failed: {response.text[:300]}",
+                )
+            return response.content
+        except httpx.TimeoutException as exc:
+            raise HTTPException(
+                status_code=504, detail="Speech generation timed out"
+            ) from exc
+        except HTTPException:
+            raise
+        except Exception as exc:
+            logger.exception("Unexpected speech generation error")
+            raise HTTPException(
+                status_code=500, detail=f"Speech generation request failed: {exc}"
+            ) from exc