Spaces:

robiul487
/

NCAkit

Sleeping

App Files Files Community

NCAkit / modules /video_creator /services /libraries /whisper_client.py

ismdrobiul489

Fix import paths: use relative imports for schemas

ceb77ca 5 months ago

raw

history blame contribute delete

2.77 kB

	import logging
	from pathlib import Path
	from typing import List, Optional

	from faster_whisper import WhisperModel

	from ...schemas import Caption

	logger = logging.getLogger(__name__)


	class WhisperClient:
	    """Client for faster-whisper caption generation.

	    The Whisper model is loaded once in the constructor (CPU, int8) and then
	    reused by every call to ``create_captions``.
	    """

	    def __init__(self, model_name: str = "tiny.en", model_dir: Optional[Path] = None):
	        """
	        Initialize Whisper client

	        Args:
	            model_name: Whisper model to use (tiny.en, base.en, medium.en, etc.)
	            model_dir: Directory to store/load models. When omitted, ``None``
	                is passed as ``download_root`` and the library picks its own
	                location.
	        """
	        self.model_name = model_name
	        # WhisperModel receives the directory as a plain string (or None).
	        self.model_dir = str(model_dir) if model_dir else None

	        logger.info(f"Loading Whisper model: {model_name}")

	        # Use CPU with int8 quantization for efficiency
	        self.model = WhisperModel(
	            model_name,
	            device="cpu",
	            compute_type="int8",
	            download_root=self.model_dir,
	        )

	        logger.info("Whisper model loaded successfully")

	    def create_captions(self, audio_path: str) -> List[Caption]:
	        """
	        Generate captions from audio file

	        Each recognised word becomes one ``Caption``. A word whose raw text
	        does not start with a space is treated as a continuation of the
	        previous caption: its text is appended and the previous caption's
	        ``endMs`` is extended. Empty tokens and bracketed special tokens
	        (e.g. ``[MUSIC]``) are dropped.

	        Args:
	            audio_path: Path to audio file (WAV format preferred)

	        Returns:
	            List of Caption objects with text and timing (milliseconds,
	            truncated to int). Empty when no usable words were produced.
	        """
	        logger.debug(f"Transcribing audio: {audio_path}")

	        # Transcribe with word-level timestamps
	        segments, _info = self.model.transcribe(
	            audio_path,
	            word_timestamps=True,
	            vad_filter=True,  # Voice activity detection to filter silence
	            vad_parameters=dict(min_silence_duration_ms=500),
	        )

	        captions: List[Caption] = []

	        for segment in segments:
	            # Segments without word timings contribute nothing.
	            for word in segment.words or ():
	                raw_text = word.word
	                text = raw_text.strip()

	                # Skip empty and special tokens. The check runs on the
	                # stripped text because tokens may carry a leading space
	                # (" [MUSIC]"), which a check on the raw text would miss.
	                if not text or text.startswith('['):
	                    continue

	                end_ms = int(word.end * 1000)

	                # Merge with previous caption if no space and previous doesn't end with space
	                if (captions
	                        and not raw_text.startswith(' ')
	                        and not captions[-1].text.endswith(' ')):
	                    captions[-1].text += text
	                    captions[-1].endMs = end_ms
	                else:
	                    captions.append(Caption(
	                        text=text,
	                        startMs=int(word.start * 1000),
	                        endMs=end_ms,
	                    ))

	        logger.debug(f"Generated {len(captions)} captions from {audio_path}")
	        return captions