Harshil748 committed on
Commit
24d1041
·
1 Parent(s): b9ffa51

Add voice cloning service and update API for voice generation

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. requirements.txt +1 -0
  3. src/api.py +171 -51
  4. src/elevenlabs_service.py +126 -0
.gitignore CHANGED
@@ -32,3 +32,4 @@ Thumbs.db
32
  # Test outputs
33
  *.wav
34
  *.mp3
 
 
32
  # Test outputs
33
  *.wav
34
  *.mp3
35
+ .env.example
requirements.txt CHANGED
@@ -22,6 +22,7 @@ pydantic
22
  # Utilities
23
  tqdm
24
  requests
 
25
 
26
  # Development (optional)
27
  pytest
 
22
  # Utilities
23
  tqdm
24
  requests
25
+ httpx
26
 
27
  # Development (optional)
28
  pytest
src/api.py CHANGED
@@ -11,7 +11,8 @@ import io
11
  import time
12
  import logging
13
  import tempfile
14
- from typing import Optional, List
 
15
  from pathlib import Path
16
  import numpy as np
17
 
@@ -23,6 +24,7 @@ from fastapi import (
23
  BackgroundTasks,
24
  UploadFile,
25
  File,
 
26
  )
27
  from fastapi.middleware.cors import CORSMiddleware
28
  from fastapi.responses import StreamingResponse, FileResponse, JSONResponse
@@ -36,6 +38,7 @@ from .config import (
36
  get_available_voices,
37
  STYLE_PRESETS,
38
  )
 
39
 
40
  # Language mapping for XTTS voice cloning
41
  XTTS_LANG_MAP = {
@@ -90,8 +93,8 @@ app = FastAPI(
90
  """,
91
  version="1.0.0",
92
  contact={
93
- "name": "Voice Tech for All Hackathon",
94
- "url": "https://huggingface.co/SYSPIN",
95
  },
96
  license_info={
97
  "name": "CC BY 4.0",
@@ -110,6 +113,18 @@ app.add_middleware(
110
 
111
  # Initialize TTS Engine (lazy loading)
112
  _engine: Optional[TTSEngine] = None
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
 
115
  def get_engine() -> TTSEngine:
@@ -173,6 +188,32 @@ class CloneResponse(BaseModel):
173
  language: str
174
 
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  class VoiceInfo(BaseModel):
177
  """Information about a voice"""
178
 
@@ -353,6 +394,85 @@ async def synthesize_stream(request: SynthesizeRequest):
353
  raise HTTPException(status_code=500, detail=str(e))
354
 
355
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  @app.post("/clone", response_class=Response)
357
  async def clone_voice(
358
  text: str = Query(..., description="Text to synthesize with cloned voice"),
@@ -364,15 +484,19 @@ async def clone_voice(
364
  pitch: float = Query(1.0, description="Pitch", ge=0.5, le=2.0),
365
  energy: float = Query(1.0, description="Energy", ge=0.5, le=2.0),
366
  style: Optional[str] = Query(None, description="Style preset"),
 
 
 
367
  speaker_wav: UploadFile = File(
368
  ..., description="Reference speaker WAV (3-15 seconds recommended)"
369
  ),
370
  ):
371
  """
372
- Clone a custom voice from uploaded sample using XTTS v2.
373
  """
374
- engine = get_engine()
375
  lang_lower = lang.lower().strip()
 
 
376
 
377
  if lang_lower not in XTTS_LANG_MAP:
378
  supported = ", ".join(sorted(XTTS_LANG_MAP.keys()))
@@ -381,55 +505,51 @@ async def clone_voice(
381
  detail=f"Unsupported clone language: {lang}. Supported: {supported}",
382
  )
383
 
384
- temp_path = None
385
- try:
386
- data = await speaker_wav.read()
387
- if len(data) < 44:
388
- raise HTTPException(status_code=400, detail="Invalid speaker_wav file")
389
-
390
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
391
- tmp.write(data)
392
- temp_path = tmp.name
393
-
394
- start_time = time.time()
395
- output = engine.clone_voice(
396
- text=text,
397
- speaker_wav_path=temp_path,
398
- language_code=XTTS_LANG_MAP[lang_lower],
399
- speed=speed,
400
- pitch=pitch,
401
- energy=energy,
402
- style=style,
403
- normalize_text=True,
404
  )
405
- inference_time = time.time() - start_time
406
-
407
- buffer = io.BytesIO()
408
- sf.write(buffer, output.audio, output.sample_rate, format="WAV")
409
- buffer.seek(0)
 
 
 
410
 
411
- return Response(
412
- content=buffer.read(),
413
- media_type="audio/wav",
414
- headers={
415
- "X-Duration": str(output.duration),
416
- "X-Sample-Rate": str(output.sample_rate),
417
- "X-Language": lang_lower,
418
- "X-Voice": "custom_cloned",
419
- "X-Inference-Time": str(inference_time),
420
- },
421
  )
422
- except HTTPException:
423
- raise
424
- except Exception as e:
425
- logger.error(f"Clone error: {e}")
426
- raise HTTPException(status_code=500, detail=str(e))
427
- finally:
428
- if temp_path and os.path.exists(temp_path):
429
- try:
430
- os.remove(temp_path)
431
- except OSError:
432
- pass
 
 
 
 
 
 
 
 
 
 
433
 
434
 
435
  @app.get("/synthesize/get")
 
11
  import time
12
  import logging
13
  import tempfile
14
+ import uuid
15
+ from typing import Optional, List, Dict
16
  from pathlib import Path
17
  import numpy as np
18
 
 
24
  BackgroundTasks,
25
  UploadFile,
26
  File,
27
+ Form,
28
  )
29
  from fastapi.middleware.cors import CORSMiddleware
30
  from fastapi.responses import StreamingResponse, FileResponse, JSONResponse
 
38
  get_available_voices,
39
  STYLE_PRESETS,
40
  )
41
+ from .elevenlabs_service import ElevenLabsService
42
 
43
  # Language mapping for XTTS voice cloning
44
  XTTS_LANG_MAP = {
 
93
  """,
94
  version="1.0.0",
95
  contact={
96
+ "name": "Harshil PAtel",
97
+ "url": "https://harshilpatel.me/#contact",
98
  },
99
  license_info={
100
  "name": "CC BY 4.0",
 
113
 
114
  # Initialize TTS Engine (lazy loading)
115
  _engine: Optional[TTSEngine] = None
116
+ _elevenlabs = ElevenLabsService()
117
+
118
+ # In-memory session voice cache for temporary cloned voice IDs
119
+ _voice_session_cache: Dict[str, str] = {}
120
+
121
+ ALLOWED_AUDIO_TYPES = {
122
+ "audio/wav",
123
+ "audio/x-wav",
124
+ "audio/mpeg",
125
+ "audio/mp3",
126
+ }
127
+ MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_BYTES", str(10 * 1024 * 1024)))
128
 
129
 
130
  def get_engine() -> TTSEngine:
 
188
  language: str
189
 
190
 
191
def _validate_audio_upload(upload: UploadFile, raw_bytes: bytes) -> None:
    """Reject a reference clip that is missing, mistyped, empty or too large.

    Args:
        upload: The multipart upload as received by the endpoint.
        raw_bytes: The bytes already read from ``upload``.

    Raises:
        HTTPException: 400 when the upload is missing, has an unsupported
            file extension or content type, or is empty; 413 when it is
            larger than ``MAX_UPLOAD_BYTES``.
    """
    if upload is None:
        raise HTTPException(status_code=400, detail="speaker_wav is required")

    filename = (upload.filename or "").lower()
    if not filename.endswith((".wav", ".mp3")):
        raise HTTPException(
            status_code=400, detail="Only .wav or .mp3 files are supported"
        )

    # Clients may append parameters ("audio/wav; codecs=1") or vary the case;
    # only the bare media type is compared against the allow-list.
    content_type = upload.content_type or ""
    media_type = content_type.split(";", 1)[0].strip().lower()
    if content_type and media_type not in ALLOWED_AUDIO_TYPES:
        raise HTTPException(
            status_code=400, detail=f"Unsupported content type: {content_type}"
        )

    if not raw_bytes:
        raise HTTPException(status_code=400, detail="Uploaded audio file is empty")

    if len(raw_bytes) > MAX_UPLOAD_BYTES:
        # 413 is the status defined for a request body that exceeds the limit.
        raise HTTPException(
            status_code=413,
            detail=f"Audio file too large. Max allowed is {MAX_UPLOAD_BYTES // (1024 * 1024)} MB",
        )
215
+
216
+
217
  class VoiceInfo(BaseModel):
218
  """Information about a voice"""
219
 
 
394
  raise HTTPException(status_code=500, detail=str(e))
395
 
396
 
397
@app.post("/generate", response_class=StreamingResponse)
async def generate_with_cloned_voice(
    text: str = Form(...),
    lang: str = Form("english"),
    session_id: Optional[str] = Form(None),
    speaker_wav: Optional[UploadFile] = File(None),
):
    """
    Temporary voice cloning + speech generation using ElevenLabs.

    Form fields:
    - text: text to synthesize.
    - lang: language name; lower-cased and passed to the speech request.
    - session_id: key for reusing a previously cloned voice. A new one is
      generated when omitted and returned in the X-Session-Id header.
    - speaker_wav: reference clip (.wav/.mp3); required when the session has
      no cached voice yet.

    Returns MP3 audio. When ELEVENLABS_FALLBACK_VOICE_ID is set, that voice is
    used if cloning fails or if synthesis with the cloned voice fails.
    """
    # Imported locally so this endpoint does not depend on a change to the
    # module's import block; FastAPI is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    lang_lower = (lang or "english").lower().strip()
    local_session_id = session_id or uuid.uuid4().hex
    fallback_voice_id = os.getenv("ELEVENLABS_FALLBACK_VOICE_ID")

    voice_id = _voice_session_cache.get(local_session_id)

    if voice_id is None:
        if speaker_wav is None:
            raise HTTPException(
                status_code=400,
                detail="speaker_wav is required for first request in a session",
            )

        # Read at most one byte past the limit: enough for the size check to
        # reject an oversized upload without buffering all of it in memory.
        audio_bytes = await speaker_wav.read(MAX_UPLOAD_BYTES + 1)
        _validate_audio_upload(speaker_wav, audio_bytes)

        # Re-wrap file for downstream service after validation read
        clone_file = UploadFile(
            filename=speaker_wav.filename,
            file=io.BytesIO(audio_bytes),
            headers=speaker_wav.headers,
        )

        try:
            voice_id = await _elevenlabs.clone_voice(clone_file)
        except HTTPException:
            if not fallback_voice_id:
                raise
            logger.warning(
                "Clone failed for session=%s, using fallback voice",
                local_session_id,
            )
            # The fallback voice is deliberately not cached, so the next
            # request in this session attempts cloning again.
            voice_id = fallback_voice_id
        else:
            _voice_session_cache[local_session_id] = voice_id
            logger.info(
                "Cloned ElevenLabs voice for session=%s voice_id=%s",
                local_session_id,
                voice_id,
            )

    async def _synthesize(target_voice_id: str) -> bytes:
        # generate_speech issues a blocking HTTP request; run it in the
        # threadpool so other requests keep being served meanwhile.
        return await run_in_threadpool(
            _elevenlabs.generate_speech,
            text=text,
            voice_id=target_voice_id,
            language=lang_lower,
        )

    try:
        audio_bytes = await _synthesize(voice_id)
    except HTTPException:
        if not fallback_voice_id or voice_id == fallback_voice_id:
            raise
        logger.warning(
            "TTS failed for voice=%s, retrying with fallback voice", voice_id
        )
        audio_bytes = await _synthesize(fallback_voice_id)
        # Report the voice that actually produced the audio.
        voice_id = fallback_voice_id

    headers = {
        "Content-Disposition": "attachment; filename=generated.mp3",
        "X-Session-Id": local_session_id,
        "X-Voice-Id": voice_id,
        "X-Provider": "elevenlabs",
    }
    return StreamingResponse(
        io.BytesIO(audio_bytes), media_type="audio/mpeg", headers=headers
    )
474
+
475
+
476
  @app.post("/clone", response_class=Response)
477
  async def clone_voice(
478
  text: str = Query(..., description="Text to synthesize with cloned voice"),
 
484
  pitch: float = Query(1.0, description="Pitch", ge=0.5, le=2.0),
485
  energy: float = Query(1.0, description="Energy", ge=0.5, le=2.0),
486
  style: Optional[str] = Query(None, description="Style preset"),
487
+ session_id: Optional[str] = Query(
488
+ None, description="Session key to reuse cloned voice"
489
+ ),
490
  speaker_wav: UploadFile = File(
491
  ..., description="Reference speaker WAV (3-15 seconds recommended)"
492
  ),
493
  ):
494
  """
495
+ Backward-compatible clone endpoint using ElevenLabs voice cloning.
496
  """
 
497
  lang_lower = lang.lower().strip()
498
+ local_session_id = session_id or uuid.uuid4().hex
499
+ fallback_voice_id = os.getenv("ELEVENLABS_FALLBACK_VOICE_ID")
500
 
501
  if lang_lower not in XTTS_LANG_MAP:
502
  supported = ", ".join(sorted(XTTS_LANG_MAP.keys()))
 
505
  detail=f"Unsupported clone language: {lang}. Supported: {supported}",
506
  )
507
 
508
+ voice_id = _voice_session_cache.get(local_session_id)
509
+ if voice_id is None:
510
+ audio_bytes = await speaker_wav.read()
511
+ _validate_audio_upload(speaker_wav, audio_bytes)
512
+ clone_file = UploadFile(
513
+ filename=speaker_wav.filename,
514
+ file=io.BytesIO(audio_bytes),
515
+ headers=speaker_wav.headers,
 
 
 
 
 
 
 
 
 
 
 
 
516
  )
517
+ try:
518
+ voice_id = await _elevenlabs.clone_voice(clone_file)
519
+ _voice_session_cache[local_session_id] = voice_id
520
+ except HTTPException as exc:
521
+ if fallback_voice_id:
522
+ voice_id = fallback_voice_id
523
+ else:
524
+ raise exc
525
 
526
+ try:
527
+ audio_bytes = _elevenlabs.generate_speech(
528
+ text=text,
529
+ voice_id=voice_id,
530
+ language=lang_lower,
 
 
 
 
 
531
  )
532
+ except HTTPException as exc:
533
+ if fallback_voice_id and voice_id != fallback_voice_id:
534
+ audio_bytes = _elevenlabs.generate_speech(
535
+ text=text,
536
+ voice_id=fallback_voice_id,
537
+ language=lang_lower,
538
+ )
539
+ else:
540
+ raise exc
541
+
542
+ return Response(
543
+ content=audio_bytes,
544
+ media_type="audio/mpeg",
545
+ headers={
546
+ "Content-Disposition": "attachment; filename=cloned_output.mp3",
547
+ "X-Language": lang_lower,
548
+ "X-Voice": voice_id,
549
+ "X-Session-Id": local_session_id,
550
+ "X-Provider": "elevenlabs",
551
+ },
552
+ )
553
 
554
 
555
  @app.get("/synthesize/get")
src/elevenlabs_service.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import uuid
4
+ from typing import Optional
5
+
6
+ import httpx
7
+ from fastapi import HTTPException, UploadFile
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class ElevenLabsService:
    """Small client for the ElevenLabs voice-cloning and text-to-speech HTTP API.

    Configuration is read from environment variables when the instance is
    created: ``ELEVENLABS_API_KEY``, ``ELEVENLABS_BASE_URL``,
    ``ELEVENLABS_MODEL_ID`` and ``ELEVENLABS_TIMEOUT_SECONDS``. Every failure
    is reported as a FastAPI ``HTTPException``.
    """

    def __init__(self) -> None:
        self.api_key = os.getenv("ELEVENLABS_API_KEY")
        self.base_url = os.getenv("ELEVENLABS_BASE_URL", "https://api.elevenlabs.io/v1")
        self.model_id = os.getenv("ELEVENLABS_MODEL_ID", "eleven_multilingual_v2")
        # Applied as the httpx timeout of every request made by this service.
        self.timeout = float(os.getenv("ELEVENLABS_TIMEOUT_SECONDS", "40"))

    def _headers(self) -> dict:
        """Return the authentication header, or raise 500 if no API key is set."""
        if not self.api_key:
            raise HTTPException(
                status_code=500,
                detail="ELEVENLABS_API_KEY is not configured",
            )
        return {"xi-api-key": self.api_key}

    async def clone_voice(self, audio_file: UploadFile) -> str:
        """Clone a voice in ElevenLabs and return the generated voice_id.

        Args:
            audio_file: Reference recording; it is read in full and uploaded
                under a random ``voiceapi-temp-...`` voice name.

        Returns:
            The ``voice_id`` field of the API response.

        Raises:
            HTTPException: 400 for a missing or empty file, 500 for a missing
                API key or an unexpected error, 502 when the API answers with
                an error status or without a ``voice_id``, 504 on timeout.
        """
        if audio_file is None:
            raise HTTPException(status_code=400, detail="speaker_wav is required")

        file_bytes = await audio_file.read()
        if not file_bytes:
            raise HTTPException(status_code=400, detail="speaker_wav is empty")

        voice_name = f"voiceapi-temp-{uuid.uuid4().hex[:10]}"

        files = {
            "files": (
                audio_file.filename or "sample.wav",
                file_bytes,
                audio_file.content_type or "audio/wav",
            )
        }
        data = {
            "name": voice_name,
            "description": "Temporary cloned voice from VoiceAPI session",
        }

        try:
            # This coroutine runs on the event loop, so the request must be
            # awaited; a synchronous client would stall every other request
            # for up to `self.timeout` seconds.
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(
                    f"{self.base_url}/voices/add",
                    headers=self._headers(),
                    data=data,
                    files=files,
                )
            if response.status_code >= 400:
                logger.error("ElevenLabs clone failed: %s", response.text)
                raise HTTPException(
                    status_code=502,
                    detail=f"Voice cloning failed: {response.text[:300]}",
                )
            payload = response.json()
            voice_id = payload.get("voice_id")
            if not voice_id:
                raise HTTPException(
                    status_code=502, detail="voice_id missing in clone response"
                )
            return voice_id
        except httpx.TimeoutException as exc:
            raise HTTPException(
                status_code=504, detail="Voice cloning timed out"
            ) from exc
        except HTTPException:
            # Already in the shape callers expect; do not re-wrap.
            raise
        except Exception as exc:
            logger.exception("Unexpected clone error")
            raise HTTPException(
                status_code=500, detail=f"Clone request failed: {exc}"
            ) from exc

    def generate_speech(
        self,
        text: str,
        voice_id: str,
        language: Optional[str] = None,
        output_format: str = "mp3_44100_128",
    ) -> bytes:
        """Generate speech bytes using ElevenLabs text-to-speech API.

        This method is synchronous and blocks until the HTTP response has
        been received.

        Args:
            text: Text to speak; must contain a non-whitespace character.
            voice_id: Voice to synthesize with.
            language: Optional value sent as ``language_code``.
            output_format: Sent as the ``output_format`` query parameter.

        Returns:
            The raw response body (requested as ``audio/mpeg``).

        Raises:
            HTTPException: 400 for blank text or a missing voice_id, 500 for
                a missing API key or an unexpected error, 502 when the API
                answers with an error status, 504 on timeout.
        """
        if not text.strip():
            raise HTTPException(status_code=400, detail="text is required")
        if not voice_id:
            raise HTTPException(status_code=400, detail="voice_id is required")

        body = {
            "text": text,
            "model_id": self.model_id,
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.75,
            },
        }
        if language:
            # NOTE(review): forwarded verbatim as `language_code`. Confirm that
            # callers pass the code format the API expects and that the
            # configured model accepts this field -- TODO verify against the
            # ElevenLabs text-to-speech reference.
            body["language_code"] = language

        try:
            with httpx.Client(timeout=self.timeout) as client:
                response = client.post(
                    f"{self.base_url}/text-to-speech/{voice_id}",
                    params={"output_format": output_format},
                    headers={**self._headers(), "Accept": "audio/mpeg"},
                    json=body,
                )
            if response.status_code >= 400:
                logger.error("ElevenLabs TTS failed: %s", response.text)
                raise HTTPException(
                    status_code=502,
                    detail=f"Speech generation failed: {response.text[:300]}",
                )
            return response.content
        except httpx.TimeoutException as exc:
            raise HTTPException(
                status_code=504, detail="Speech generation timed out"
            ) from exc
        except HTTPException:
            # Already in the shape callers expect; do not re-wrap.
            raise
        except Exception as exc:
            logger.exception("Unexpected speech generation error")
            raise HTTPException(
                status_code=500, detail=f"Speech generation request failed: {exc}"
            ) from exc