import logging
from pathlib import Path
from typing import List, Optional

from faster_whisper import WhisperModel

from ...schemas import Caption
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
class WhisperClient:
    """Client for faster-whisper caption generation."""

    def __init__(self, model_name: str = "tiny.en", model_dir: Optional[Path] = None):
        """
        Initialize Whisper client and load the model into memory.

        Args:
            model_name: Whisper model to use (tiny.en, base.en, medium.en, etc.)
            model_dir: Directory to store/load models; None falls back to the
                faster-whisper default cache location.
        """
        self.model_name = model_name
        # WhisperModel expects a string path (or None) for download_root.
        self.model_dir = str(model_dir) if model_dir else None

        logger.info("Loading Whisper model: %s", model_name)

        # CPU + int8 quantization keeps memory low and avoids a GPU
        # requirement; adequate for the small ".en" models used here.
        self.model = WhisperModel(
            model_name,
            device="cpu",
            compute_type="int8",
            download_root=self.model_dir,
        )

        logger.info("Whisper model loaded successfully")

    def create_captions(self, audio_path: str) -> List[Caption]:
        """
        Generate word-level captions from an audio file.

        Args:
            audio_path: Path to audio file (WAV format preferred)

        Returns:
            List of Caption objects with text and start/end times in
            milliseconds, in transcription order.
        """
        logger.debug("Transcribing audio: %s", audio_path)

        segments, _info = self.model.transcribe(
            audio_path,
            word_timestamps=True,
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=500),
        )

        captions: List[Caption] = []

        for segment in segments:
            if not segment.words:
                continue

            for word in segment.words:
                token = word.word
                # Skip empty tokens and bracketed non-speech markers such
                # as "[MUSIC]".
                # NOTE(review): faster-whisper word tokens usually carry a
                # leading space, so the '[' test may miss " [MUSIC]" —
                # confirm whether token.lstrip().startswith('[') is intended.
                if not token.strip() or token.startswith('['):
                    continue

                # A token without a leading space is a sub-word continuation
                # of the previous word: glue it onto the last caption and
                # extend that caption's end time. (The previous text is
                # always stripped, so no extra whitespace check is needed.)
                if captions and not token.startswith(' '):
                    captions[-1].text += token.strip()
                    captions[-1].endMs = int(word.end * 1000)
                else:
                    captions.append(Caption(
                        text=token.strip(),
                        startMs=int(word.start * 1000),
                        endMs=int(word.end * 1000),
                    ))

        logger.debug("Generated %d captions from %s", len(captions), audio_path)
        return captions
|
|