| """Pydantic schemas for Speech-to-Text and Text-to-Speech endpoints""" |
|
|
| from pydantic import BaseModel, Field, ConfigDict |
| from typing import Optional |
|
|
|
|
| |
| |
| |
|
|
| class STTResponse(BaseModel): |
| """Response model for Whisper speech → text""" |
| model_config = ConfigDict( |
| json_schema_extra={ |
| "example": { |
| "text": "hello how are you", |
| "model_name": "openai/whisper-large-v3", |
| "language": "en", |
| "duration_seconds": 3.2 |
| } |
| } |
| ) |
|
|
| text: str = Field(..., description="Transcribed text from the input audio") |
| model_name: str = Field(..., description="STT model used for inference") |
| language: Optional[str] = Field(None, description="Detected language") |
| duration_seconds: Optional[float] = Field( |
| None, |
| description="Approximate audio duration in seconds" |
| ) |
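
# Usage sketch, illustrative only: constructing the response as a Whisper
# endpoint handler might. Field values mirror the json_schema_extra example
# above; nothing here is a confirmed part of the project's runtime code.
#
#     >>> resp = STTResponse(
#     ...     text="hello how are you",
#     ...     model_name="openai/whisper-large-v3",
#     ...     language="en",
#     ...     duration_seconds=3.2,
#     ... )
#     >>> resp.language
#     'en'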


class TTSRequest(BaseModel):
    """Request model for text-to-speech conversion."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "text": "Hello, welcome to our AI system."
            }
        }
    )

    text: str = Field(
        ...,
        min_length=1,
        max_length=500,
        description="Text that will be converted into speech",
    )
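
# Usage sketch, illustrative only: the min_length/max_length constraints on
# `text` mean an empty string is rejected at validation time.
#
#     >>> TTSRequest(text="Hello, welcome to our AI system.").text
#     'Hello, welcome to our AI system.'
#     >>> from pydantic import ValidationError
#     >>> try:
#     ...     TTSRequest(text="")
#     ... except ValidationError:
#     ...     print("rejected: text must be 1-500 characters")
#     rejected: text must be 1-500 characters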


class TTSResponse(BaseModel):
    """Metadata response for TTS generation."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "message": "Audio generated successfully",
                "audio_format": "wav",
                "length_seconds": 2.5,
                "model_name": "suno/bark",
            }
        }
    )

    message: str = Field(..., description="Human-readable status message")
    audio_format: str = Field(..., description="Format of the generated audio, e.g. 'wav'")
    length_seconds: Optional[float] = Field(None, description="Generated audio duration in seconds")
    model_name: str = Field(..., description="TTS model used for inference")
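
# Minimal runnable sketch (assumption: run directly, not through the API) that
# serializes the schemas the way a client of the endpoints would see them.
# model_dump_json is the Pydantic v2 serializer; v2 is implied by ConfigDict.
if __name__ == "__main__":
    stt = STTResponse(
        text="hello how are you",
        model_name="openai/whisper-large-v3",
        language="en",
        duration_seconds=3.2,
    )
    tts_meta = TTSResponse(
        message="Audio generated successfully",
        audio_format="wav",
        length_seconds=2.5,
        model_name="suno/bark",
    )
    print(stt.model_dump_json(indent=2))
    print(tts_meta.model_dump_json(indent=2))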