| """Speech-to-Text & Text-to-Speech API Endpoints""" |
|
|
| from fastapi import APIRouter, UploadFile, File, HTTPException |
| from fastapi.responses import StreamingResponse |
| import logging |
|
|
| from models.audio import ( |
| STTResponse, |
| TTSRequest, |
| TTSResponse |
| ) |
|
|
| from services.stt_service import transcribe_audio |
| from services.tts_service import text_to_speech |
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Router for the audio endpoints: routes are registered under the "/audio"
# prefix and tagged "Audio".
router = APIRouter(tags=["Audio"], prefix="/audio")
|
|
|
|
| |
| |
| |
|
|
@router.post("/speech-to-text", response_model=STTResponse)
async def speech_to_text_endpoint(file: UploadFile = File(...)):
    """
    Convert speech to text using openai/whisper-large-v3.

    - Upload an audio file (wav, mp3, m4a…)
    - Returns transcribed English text
    - Responds with 400 when the upload is empty and 500 when transcription fails
    """
    try:
        # The whole upload is read into memory before transcription.
        audio_bytes = await file.read()

        # An empty upload is a client error, not a server failure: reject it
        # before handing it to the transcription service.
        if not audio_bytes:
            raise HTTPException(status_code=400, detail="Uploaded audio file is empty")

        # NOTE(review): transcribe_audio is called synchronously inside an
        # async handler; if it blocks for long it stalls the event loop.
        # Consider offloading it to a worker thread once the service is
        # confirmed to be thread-safe.
        result = transcribe_audio(audio_bytes)

        # Model name and language are reported as fixed values; the duration
        # is not computed here.
        response_data = STTResponse(
            text=result,
            model_name="openai/whisper-large-v3",
            language="en",
            duration_seconds=None,
        )

        # Only the first 40 characters of the transcription are logged.
        logger.info("STT completed: %s...", response_data.text[:40])
        return response_data

    except HTTPException:
        # Deliberate HTTP errors (such as the 400 above) must reach the client
        # unchanged instead of being rewritten as a 500.
        raise
    except Exception as e:
        # logger.exception records the traceback in addition to the message.
        logger.exception("STT error: %s", e)
        raise HTTPException(status_code=500, detail=f"Speech-to-text failed: {str(e)}") from e
|
|
|
|
| |
| |
| |
|
|
# The handler returns a StreamingResponse, so the route declares that response
# class and an audio/wav body instead of a JSON response_model that is never sent.
@router.post(
    "/text-to-speech",
    response_class=StreamingResponse,
    responses={
        200: {
            "content": {"audio/wav": {}},
            "description": "Synthesized WAV audio; metadata is sent in the X-Audio-Metadata header.",
        }
    },
)
async def text_to_speech_endpoint(request: TTSRequest):
    """
    Convert text to synthesized speech using Bark.
    Returns streamed audio.

    - The response body is audio/wav
    - Generation metadata is sent as JSON in the X-Audio-Metadata header
    - Responds with 400 when the text is blank and 500 when synthesis fails
    """
    try:
        # Blank input is a client error: reject it before invoking synthesis.
        if not request.text or not request.text.strip():
            raise HTTPException(status_code=400, detail="Text to synthesize must not be empty")

        # NOTE(review): text_to_speech is called synchronously inside an async
        # handler; if it blocks for long it stalls the event loop. Consider
        # offloading it to a worker thread once the service is confirmed to be
        # thread-safe.
        audio_bytes = text_to_speech(request.text)

        # The metadata travels in a response header because the body carries
        # the audio itself; the length is not computed here.
        metadata = TTSResponse(
            message="Audio generated successfully",
            audio_format="wav",
            length_seconds=None,
            model_name="suno/bark",
        )

        # Only the first 40 characters of the input text are logged.
        logger.info("TTS generated for text: %s...", request.text[:40])

        # The audio is already fully in memory, so it is sent as a single chunk.
        return StreamingResponse(
            iter([audio_bytes]),
            media_type="audio/wav",
            headers={
                "X-Audio-Metadata": metadata.model_dump_json(),
            },
        )

    except HTTPException:
        # Deliberate HTTP errors (such as the 400 above) must reach the client
        # unchanged instead of being rewritten as a 500.
        raise
    except Exception as e:
        # logger.exception records the traceback in addition to the message.
        logger.exception("TTS error: %s", e)
        raise HTTPException(status_code=500, detail=f"Text-to-speech failed: {str(e)}") from e
|
|