NCAkit / modules /video_creator /services /libraries /whisper_client.py
ismdrobiul489's picture
Fix import paths: use relative imports for schemas
ceb77ca
import logging
from pathlib import Path
from typing import List, Optional

from faster_whisper import WhisperModel

from ...schemas import Caption
logger = logging.getLogger(__name__)
class WhisperClient:
    """Client for faster-whisper caption generation.

    Loads a ``WhisperModel`` once at construction time and converts the
    word-level output of ``transcribe`` into a flat list of ``Caption``
    objects carrying text plus start/end times in milliseconds.
    """

    def __init__(self, model_name: str = "tiny.en", model_dir: Optional[Path] = None):
        """
        Initialize Whisper client

        Args:
            model_name: Whisper model to use (tiny.en, base.en, medium.en, etc.)
            model_dir: Directory to store/load models. When omitted (or falsy),
                ``download_root=None`` is passed through to ``WhisperModel``.
        """
        self.model_name = model_name
        # WhisperModel receives the directory as a plain string (or None).
        self.model_dir = str(model_dir) if model_dir else None

        logger.info("Loading Whisper model: %s", model_name)

        # Use CPU with int8 quantization for efficiency
        self.model = WhisperModel(
            model_name,
            device="cpu",
            compute_type="int8",
            download_root=self.model_dir,
        )

        logger.info("Whisper model loaded successfully")

    @staticmethod
    def _to_ms(seconds: float) -> int:
        """Convert a timestamp in seconds to whole milliseconds.

        Rounds to the nearest millisecond instead of truncating, so binary
        floating-point error (e.g. ``1.005 * 1000 == 1004.9999999999999``)
        cannot shave a millisecond off the result.
        """
        return int(round(seconds * 1000))

    @staticmethod
    def _should_skip(token: str) -> bool:
        """Return True for tokens that must not become caption text.

        A token is skipped when it is blank or when its text, ignoring
        surrounding whitespace, starts with ``[`` (bracketed special /
        non-speech markers). The check is done on the stripped text because
        word tokens may carry a leading space, which would otherwise hide
        the bracket from ``startswith``.
        """
        text = token.strip()
        return not text or text.startswith("[")

    def create_captions(self, audio_path: str) -> List[Caption]:
        """
        Generate captions from audio file

        Args:
            audio_path: Path to audio file (WAV format preferred)

        Returns:
            List of Caption objects with text and timing (milliseconds)
        """
        logger.debug("Transcribing audio: %s", audio_path)

        # Transcribe with word-level timestamps
        segments, _info = self.model.transcribe(
            audio_path,
            word_timestamps=True,
            vad_filter=True,  # Voice activity detection to filter silence
            vad_parameters=dict(min_silence_duration_ms=500),
        )

        captions: List[Caption] = []

        for segment in segments:
            # Segments without word timings contribute nothing.
            for word in segment.words or ():
                token = word.word
                if self._should_skip(token):
                    continue

                text = token.strip()
                end_ms = self._to_ms(word.end)
                previous = captions[-1] if captions else None

                # A token with no leading space continues the previous word
                # (e.g. a split word or attached punctuation): merge it into
                # the previous caption and extend that caption's end time.
                if (
                    previous is not None
                    and not token.startswith(" ")
                    and not previous.text.endswith(" ")
                ):
                    previous.text += text
                    previous.endMs = end_ms
                else:
                    captions.append(
                        Caption(
                            text=text,
                            startMs=self._to_ms(word.start),
                            endMs=end_ms,
                        )
                    )

        logger.debug("Generated %d captions from %s", len(captions), audio_path)
        return captions