"""Voice note transcription for messaging platforms. Supports: - Local Whisper (cpu/cuda): Hugging Face transformers pipeline - NVIDIA NIM: NVIDIA NIM Whisper/Parakeet """ import os from pathlib import Path from typing import Any from loguru import logger from config.settings import get_settings # Max file size in bytes (25 MB) MAX_AUDIO_SIZE_BYTES = 25 * 1024 * 1024 # NVIDIA NIM Whisper model mapping: (function_id, language_code) _NIM_MODEL_MAP: dict[str, tuple[str, str]] = { "nvidia/parakeet-ctc-0.6b-zh-tw": ("8473f56d-51ef-473c-bb26-efd4f5def2bf", "zh-TW"), "nvidia/parakeet-ctc-0.6b-zh-cn": ("9add5ef7-322e-47e0-ad7a-5653fb8d259b", "zh-CN"), "nvidia/parakeet-ctc-0.6b-es": ("None", "es-US"), "nvidia/parakeet-ctc-0.6b-vi": ("f3dff2bb-99f9-403d-a5f1-f574a757deb0", "vi-VN"), "nvidia/parakeet-ctc-1.1b-asr": ("1598d209-5e27-4d3c-8079-4751568b1081", "en-US"), "nvidia/parakeet-ctc-0.6b-asr": ("d8dd4e9b-fbf5-4fb0-9dba-8cf436c8d965", "en-US"), "nvidia/parakeet-1.1b-rnnt-multilingual-asr": ( "71203149-d3b7-4460-8231-1be2543a1fca", "", ), "openai/whisper-large-v3": ("b702f636-f60c-4a3d-a6f4-f3568c13bd7d", "multi"), } # Short model names -> full Hugging Face model IDs (for local Whisper) _MODEL_MAP: dict[str, str] = { "tiny": "openai/whisper-tiny", "base": "openai/whisper-base", "small": "openai/whisper-small", "medium": "openai/whisper-medium", "large-v2": "openai/whisper-large-v2", "large-v3": "openai/whisper-large-v3", "large-v3-turbo": "openai/whisper-large-v3-turbo", } # Lazy-loaded pipelines: (model_id, device) -> pipeline _pipeline_cache: dict[tuple[str, str], Any] = {} def _resolve_model_id(whisper_model: str) -> str: """Resolve short name to full Hugging Face model ID.""" return _MODEL_MAP.get(whisper_model, whisper_model) def _get_pipeline(model_id: str, device: str) -> Any: """Lazy-load transformers Whisper pipeline. Raises ImportError if not installed.""" global _pipeline_cache if device not in ("cpu", "cuda"): raise ValueError(f"whisper_device must be 'cpu' or 'cuda', got {device!r}") cache_key = (model_id, device) if cache_key not in _pipeline_cache: try: import torch from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline token = get_settings().hf_token if token: os.environ["HF_TOKEN"] = token use_cuda = device == "cuda" and torch.cuda.is_available() pipe_device = "cuda:0" if use_cuda else "cpu" model_dtype = torch.float16 if use_cuda else torch.float32 model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, dtype=model_dtype, low_cpu_mem_usage=True, attn_implementation="sdpa", ) model = model.to(pipe_device) processor = AutoProcessor.from_pretrained(model_id) pipe = pipeline( "automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, device=pipe_device, ) _pipeline_cache[cache_key] = pipe logger.debug( f"Loaded Whisper pipeline: model={model_id} device={pipe_device}" ) except ImportError as e: raise ImportError( "Local Whisper requires the voice_local extra. Install with: uv sync --extra voice_local" ) from e return _pipeline_cache[cache_key] def transcribe_audio( file_path: Path, mime_type: str, *, whisper_model: str = "base", whisper_device: str = "cpu", ) -> str: """ Transcribe audio file to text. Supports: - whisper_device="cpu"/"cuda": local Whisper (requires voice_local extra) - whisper_device="nvidia_nim": NVIDIA NIM Whisper API (requires voice extra) Args: file_path: Path to audio file (OGG, MP3, MP4, WAV, M4A supported) mime_type: MIME type of the audio (e.g. "audio/ogg") whisper_model: Model ID or short name (local) or NVIDIA NIM model whisper_device: "cpu" | "cuda" | "nvidia_nim" (defaults to WHISPER_DEVICE env var) Returns: Transcribed text Raises: FileNotFoundError: If file does not exist ValueError: If file too large ImportError: If voice_local extra not installed (for local Whisper) """ if not file_path.exists(): raise FileNotFoundError(f"Audio file not found: {file_path}") size = file_path.stat().st_size if size > MAX_AUDIO_SIZE_BYTES: raise ValueError( f"Audio file too large ({size} bytes). Max {MAX_AUDIO_SIZE_BYTES} bytes." ) if whisper_device == "nvidia_nim": return _transcribe_nim(file_path, whisper_model) else: return _transcribe_local(file_path, whisper_model, whisper_device) # Whisper expects 16 kHz sample rate _WHISPER_SAMPLE_RATE = 16000 def _load_audio(file_path: Path) -> dict[str, Any]: """Load audio file to waveform dict. No ffmpeg required.""" import librosa waveform, sr = librosa.load(str(file_path), sr=_WHISPER_SAMPLE_RATE, mono=True) return {"array": waveform, "sampling_rate": sr} def _transcribe_local(file_path: Path, whisper_model: str, whisper_device: str) -> str: """Transcribe using transformers Whisper pipeline.""" model_id = _resolve_model_id(whisper_model) pipe = _get_pipeline(model_id, whisper_device) audio = _load_audio(file_path) result = pipe(audio, generate_kwargs={"language": "en", "task": "transcribe"}) text = result.get("text", "") or "" if isinstance(text, list): text = " ".join(text) if text else "" result_text = text.strip() logger.debug(f"Local transcription: {len(result_text)} chars") return result_text or "(no speech detected)" def _transcribe_nim(file_path: Path, model: str) -> str: """Transcribe using NVIDIA NIM Whisper API via Riva gRPC client.""" try: import riva.client except ImportError as e: raise ImportError( "NVIDIA NIM transcription requires the voice extra. " "Install with: uv sync --extra voice" ) from e settings = get_settings() api_key = settings.nvidia_nim_api_key # Look up function ID and language code from model mapping model_config = _NIM_MODEL_MAP.get(model) if not model_config: raise ValueError( f"No NVIDIA NIM config found for model: {model}. " f"Supported models: {', '.join(_NIM_MODEL_MAP.keys())}" ) function_id, language_code = model_config # Riva server configuration server = "grpc.nvcf.nvidia.com:443" # Auth with SSL and metadata auth = riva.client.Auth( use_ssl=True, uri=server, metadata_args=[ ["function-id", function_id], ["authorization", f"Bearer {api_key}"], ], ) asr_service = riva.client.ASRService(auth) # Configure recognition - language_code from model config config = riva.client.RecognitionConfig( language_code=language_code, max_alternatives=1, verbatim_transcripts=True, ) # Read audio file with open(file_path, "rb") as f: data = f.read() # Perform offline recognition response = asr_service.offline_recognize(data, config) # Extract text from response - use getattr for safe attribute access transcript = "" results = getattr(response, "results", None) if results and results[0].alternatives: transcript = results[0].alternatives[0].transcript logger.debug(f"NIM transcription: {len(transcript)} chars") return transcript or "(no speech detected)"