import logging
from pathlib import Path
from typing import List, Optional

from faster_whisper import WhisperModel

from ...schemas import Caption

logger = logging.getLogger(__name__)


class WhisperClient:
    """Client for faster-whisper caption generation"""

    def __init__(
        self,
        model_name: str = "tiny.en",
        model_dir: Optional[Path] = None,
    ):
        """
        Initialize Whisper client

        Loads the model eagerly, so constructing the client may take a while.

        Args:
            model_name: Whisper model to use (tiny.en, base.en, medium.en, etc.)
            model_dir: Directory to store/load models. When omitted, None is
                forwarded as ``download_root`` to WhisperModel.
        """
        self.model_name = model_name
        # WhisperModel is handed a plain string path (or None), not a Path.
        self.model_dir = str(model_dir) if model_dir else None

        logger.info("Loading Whisper model: %s", model_name)

        # Use CPU with int8 quantization for efficiency
        self.model = WhisperModel(
            model_name,
            device="cpu",
            compute_type="int8",
            download_root=self.model_dir,
        )

        logger.info("Whisper model loaded successfully")

    def create_captions(self, audio_path: str) -> List[Caption]:
        """
        Generate captions from audio file

        Each transcribed word becomes one Caption. A word whose raw text has
        no leading space is appended to the previous caption instead, and
        that caption's end time is extended. Blank words and words whose
        text starts with '[' are dropped.

        Args:
            audio_path: Path to audio file (WAV format preferred)

        Returns:
            List of Caption objects with text and timing in milliseconds
        """
        logger.debug("Transcribing audio: %s", audio_path)

        # Transcribe with word-level timestamps
        segments, _info = self.model.transcribe(
            audio_path,
            word_timestamps=True,
            vad_filter=True,  # Voice activity detection to filter silence
            vad_parameters=dict(min_silence_duration_ms=500),
        )

        captions: List[Caption] = []

        for segment in segments:
            # Segments without word data contribute nothing.
            for word in segment.words or ():
                raw_text = word.word
                text = raw_text.strip()

                # Skip blank words and bracketed special tokens. The check
                # runs on the stripped text because the raw word may carry a
                # leading space (" [MUSIC]"), which a raw startswith('[')
                # would miss.
                if not text or text.startswith("["):
                    continue

                end_ms = int(word.end * 1000)
                previous = captions[-1] if captions else None

                # Merge with previous caption if no space and previous doesn't
                # end with space. Caption text is stored stripped here, so the
                # trailing-space guard looks redundant; kept as a safeguard
                # (NOTE(review): confirm Caption stores text verbatim).
                if (
                    previous is not None
                    and not raw_text.startswith(" ")
                    and not previous.text.endswith(" ")
                ):
                    previous.text += text
                    previous.endMs = end_ms
                else:
                    captions.append(
                        Caption(
                            text=text,
                            startMs=int(word.start * 1000),
                            endMs=end_ms,
                        )
                    )

        logger.debug("Generated %d captions from %s", len(captions), audio_path)
        return captions