import logging
from pathlib import Path
from typing import List, Optional

from faster_whisper import WhisperModel

from ...schemas import Caption

logger = logging.getLogger(__name__)


class WhisperClient:
    """Client for faster-whisper caption generation.

    Loads a ``WhisperModel`` once at construction time and turns audio files
    into a flat list of word-level ``Caption`` objects.
    """

    def __init__(self, model_name: str = "tiny.en", model_dir: Optional[Path] = None):
        """
        Initialize Whisper client

        Args:
            model_name: Whisper model to use (tiny.en, base.en, medium.en, etc.)
            model_dir: Directory to store/load models. When omitted, ``None``
                is passed to ``WhisperModel`` as ``download_root``.
        """
        self.model_name = model_name
        # WhisperModel is given a plain string (or None), not a Path object.
        self.model_dir = str(model_dir) if model_dir else None

        logger.info("Loading Whisper model: %s", model_name)

        # Use CPU with int8 quantization for efficiency
        self.model = WhisperModel(
            model_name,
            device="cpu",
            compute_type="int8",
            download_root=self.model_dir,
        )

        logger.info("Whisper model loaded successfully")

    @staticmethod
    def _to_ms(seconds: float) -> int:
        """Convert a timestamp in seconds to the nearest whole millisecond.

        Rounding (rather than truncating with ``int``) avoids off-by-one
        results caused by binary floating point, e.g. ``2.01 * 1000`` evaluates
        to ``2009.9999999999998`` and would otherwise become ``2009``.
        """
        return int(round(seconds * 1000))

    def create_captions(self, audio_path: str) -> List[Caption]:
        """
        Generate captions from audio file

        Each transcribed word becomes one caption. A word whose raw text does
        not start with a space is appended to the previous caption instead of
        starting a new one, and that caption's end time is extended.

        Args:
            audio_path: Path to audio file (WAV format preferred)

        Returns:
            List of Caption objects with text and timing (milliseconds)
        """
        logger.debug("Transcribing audio: %s", audio_path)

        # Transcribe with word-level timestamps. The second element of the
        # returned pair (transcription info) is not needed here.
        segments, _info = self.model.transcribe(
            audio_path,
            word_timestamps=True,
            vad_filter=True,  # Voice activity detection to filter silence
            vad_parameters=dict(min_silence_duration_ms=500),
        )

        captions: List[Caption] = []

        for segment in segments:
            if not segment.words:
                continue

            for word in segment.words:
                raw = word.word
                text = raw.strip()

                # Skip special tokens and whitespace-only words.
                # NOTE(review): the '[' test runs on the unstripped token, so a
                # token with whitespace before '[' is NOT skipped — confirm
                # whether that is intended.
                if raw.startswith('[') or text == '':
                    continue

                # Merge with previous caption if no space and previous doesn't
                # end with space.
                # NOTE(review): caption text is always stored stripped in this
                # method, so the endswith(' ') test looks like it can never be
                # False — kept as-is to preserve behaviour; confirm and simplify.
                is_continuation = (
                    bool(captions)
                    and not raw.startswith(' ')
                    and not captions[-1].text.endswith(' ')
                )

                if is_continuation:
                    captions[-1].text += text
                    captions[-1].endMs = self._to_ms(word.end)
                else:
                    captions.append(Caption(
                        text=text,
                        startMs=self._to_ms(word.start),
                        endMs=self._to_ms(word.end),
                    ))

        logger.debug("Generated %d captions from %s", len(captions), audio_path)
        return captions