from pydantic import BaseModel, Field, ConfigDict
from typing import Optional
|
|
| |
| |
| |
class STTResponse(BaseModel):
    """Response payload returned by the speech-to-text endpoint."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "text": "hello how are you",
                "model_name": "openai/whisper-large-v3",
                "language": "en",
                "duration_seconds": 3.2,
            }
        }
    )

    # Required transcription output plus optional detection metadata.
    text: str = Field(description="Transcribed text from the input audio")
    model_name: str = Field(description="STT model used for inference")
    language: Optional[str] = Field(default=None, description="Detected language")
    duration_seconds: Optional[float] = Field(
        default=None,
        description="Approximate audio duration in seconds",
    )
|
|
| |
| |
| |
class TTSRequest(BaseModel):
    """Request payload for the text-to-speech endpoint."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {"text": "Hello, welcome to our AI system."}
        }
    )

    # Length bounds reject empty input and cap synthesis request size.
    text: str = Field(
        ...,
        min_length=1,
        max_length=500,
        description="Text to convert to speech",
    )
|
|
class TTSResponse(BaseModel):
    """Response payload returned by the text-to-speech endpoint.

    Fields carry ``Field(description=...)`` metadata so the generated
    OpenAPI schema documents them, consistent with the other response
    models in this module.
    """

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "message": "Audio generated successfully",
                "audio_format": "wav",
                "length_seconds": 2.5,
                "model_name": "suno/bark",
            }
        }
    )

    message: str = Field(..., description="Human-readable status message")
    audio_format: str = Field(
        ..., description="Format of the generated audio (e.g. 'wav')"
    )
    length_seconds: Optional[float] = Field(
        None, description="Approximate duration of the generated audio in seconds"
    )
    model_name: str = Field(..., description="TTS model used for synthesis")
|
|