# messaging/transcription.py
# Provenance: Hugging Face Space "Claude_Code" by Jainish1808, commit bf177ff
# ("Move project files to repository root for Hugging Face Space")
"""Voice note transcription for messaging platforms.
Supports:
- Local Whisper (cpu/cuda): Hugging Face transformers pipeline
- NVIDIA NIM: NVIDIA NIM Whisper/Parakeet
"""
import os
from pathlib import Path
from typing import Any
from loguru import logger
from config.settings import get_settings
# Max file size in bytes (25 MB)
MAX_AUDIO_SIZE_BYTES = 25 * 1024 * 1024
# NVIDIA NIM Whisper model mapping: (function_id, language_code)
# function_id is the NVCF function UUID sent as gRPC metadata by
# _transcribe_nim; language_code is passed to Riva's RecognitionConfig.
_NIM_MODEL_MAP: dict[str, tuple[str, str]] = {
    "nvidia/parakeet-ctc-0.6b-zh-tw": ("8473f56d-51ef-473c-bb26-efd4f5def2bf", "zh-TW"),
    "nvidia/parakeet-ctc-0.6b-zh-cn": ("9add5ef7-322e-47e0-ad7a-5653fb8d259b", "zh-CN"),
    # NOTE(review): the literal string "None" looks like a placeholder for a
    # missing function UUID — confirm before routing requests to this model.
    "nvidia/parakeet-ctc-0.6b-es": ("None", "es-US"),
    "nvidia/parakeet-ctc-0.6b-vi": ("f3dff2bb-99f9-403d-a5f1-f574a757deb0", "vi-VN"),
    "nvidia/parakeet-ctc-1.1b-asr": ("1598d209-5e27-4d3c-8079-4751568b1081", "en-US"),
    "nvidia/parakeet-ctc-0.6b-asr": ("d8dd4e9b-fbf5-4fb0-9dba-8cf436c8d965", "en-US"),
    # Empty language code — presumably the multilingual model auto-detects
    # the language server-side; TODO confirm against the NIM function docs.
    "nvidia/parakeet-1.1b-rnnt-multilingual-asr": (
        "71203149-d3b7-4460-8231-1be2543a1fca",
        "",
    ),
    "openai/whisper-large-v3": ("b702f636-f60c-4a3d-a6f4-f3568c13bd7d", "multi"),
}
# Short model names -> full Hugging Face model IDs (for local Whisper)
_MODEL_MAP: dict[str, str] = {
"tiny": "openai/whisper-tiny",
"base": "openai/whisper-base",
"small": "openai/whisper-small",
"medium": "openai/whisper-medium",
"large-v2": "openai/whisper-large-v2",
"large-v3": "openai/whisper-large-v3",
"large-v3-turbo": "openai/whisper-large-v3-turbo",
}
# Lazy-loaded pipelines: (model_id, device) -> pipeline
_pipeline_cache: dict[tuple[str, str], Any] = {}
def _resolve_model_id(whisper_model: str) -> str:
"""Resolve short name to full Hugging Face model ID."""
return _MODEL_MAP.get(whisper_model, whisper_model)
def _get_pipeline(model_id: str, device: str) -> Any:
"""Lazy-load transformers Whisper pipeline. Raises ImportError if not installed."""
global _pipeline_cache
if device not in ("cpu", "cuda"):
raise ValueError(f"whisper_device must be 'cpu' or 'cuda', got {device!r}")
cache_key = (model_id, device)
if cache_key not in _pipeline_cache:
try:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
token = get_settings().hf_token
if token:
os.environ["HF_TOKEN"] = token
use_cuda = device == "cuda" and torch.cuda.is_available()
pipe_device = "cuda:0" if use_cuda else "cpu"
model_dtype = torch.float16 if use_cuda else torch.float32
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_id,
dtype=model_dtype,
low_cpu_mem_usage=True,
attn_implementation="sdpa",
)
model = model.to(pipe_device)
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
device=pipe_device,
)
_pipeline_cache[cache_key] = pipe
logger.debug(
f"Loaded Whisper pipeline: model={model_id} device={pipe_device}"
)
except ImportError as e:
raise ImportError(
"Local Whisper requires the voice_local extra. Install with: uv sync --extra voice_local"
) from e
return _pipeline_cache[cache_key]
def transcribe_audio(
    file_path: Path,
    mime_type: str,
    *,
    whisper_model: str = "base",
    whisper_device: str = "cpu",
) -> str:
    """Transcribe an audio file to text.

    Backends:
    - whisper_device="cpu"/"cuda": local Whisper (requires voice_local extra)
    - whisper_device="nvidia_nim": NVIDIA NIM Whisper API (requires voice extra)

    Args:
        file_path: Path to audio file (OGG, MP3, MP4, WAV, M4A supported).
        mime_type: MIME type of the audio (e.g. "audio/ogg").
        whisper_model: Model ID or short name (local) or NVIDIA NIM model.
        whisper_device: "cpu" | "cuda" | "nvidia_nim" (default "cpu").

    Returns:
        Transcribed text.

    Raises:
        FileNotFoundError: If file does not exist.
        ValueError: If file too large.
        ImportError: If voice_local extra not installed (for local Whisper).
    """
    # Validate the input before touching any backend.
    if not file_path.exists():
        raise FileNotFoundError(f"Audio file not found: {file_path}")
    size = file_path.stat().st_size
    if size > MAX_AUDIO_SIZE_BYTES:
        raise ValueError(
            f"Audio file too large ({size} bytes). Max {MAX_AUDIO_SIZE_BYTES} bytes."
        )
    # Dispatch on backend: NIM is remote; everything else goes local.
    if whisper_device == "nvidia_nim":
        return _transcribe_nim(file_path, whisper_model)
    return _transcribe_local(file_path, whisper_model, whisper_device)
# Whisper models consume 16 kHz audio.
_WHISPER_SAMPLE_RATE = 16000


def _load_audio(file_path: Path) -> dict[str, Any]:
    """Decode an audio file into the waveform dict the Whisper pipeline expects.

    Uses librosa for decoding (no ffmpeg binary required), resampling to
    16 kHz mono.
    """
    import librosa

    samples, rate = librosa.load(str(file_path), sr=_WHISPER_SAMPLE_RATE, mono=True)
    return {"sampling_rate": rate, "array": samples}
def _transcribe_local(
    file_path: Path,
    whisper_model: str,
    whisper_device: str,
    *,
    language: str = "en",
) -> str:
    """Transcribe an audio file with a local transformers Whisper pipeline.

    Args:
        file_path: Path to the audio file.
        whisper_model: Short alias or full Hugging Face model ID.
        whisper_device: "cpu" or "cuda".
        language: Decoding language hint passed to Whisper. Defaults to "en"
            (the previously hard-coded value) for backward compatibility;
            pass the source language for non-English audio.

    Returns:
        Stripped transcript, or "(no speech detected)" when empty.
    """
    model_id = _resolve_model_id(whisper_model)
    pipe = _get_pipeline(model_id, whisper_device)
    audio = _load_audio(file_path)
    result = pipe(audio, generate_kwargs={"language": language, "task": "transcribe"})
    text = result.get("text", "") or ""
    # Some pipeline configurations return chunked output as a list of strings.
    if isinstance(text, list):
        text = " ".join(text) if text else ""
    result_text = text.strip()
    logger.debug(f"Local transcription: {len(result_text)} chars")
    return result_text or "(no speech detected)"
def _transcribe_nim(file_path: Path, model: str) -> str:
"""Transcribe using NVIDIA NIM Whisper API via Riva gRPC client."""
try:
import riva.client
except ImportError as e:
raise ImportError(
"NVIDIA NIM transcription requires the voice extra. "
"Install with: uv sync --extra voice"
) from e
settings = get_settings()
api_key = settings.nvidia_nim_api_key
# Look up function ID and language code from model mapping
model_config = _NIM_MODEL_MAP.get(model)
if not model_config:
raise ValueError(
f"No NVIDIA NIM config found for model: {model}. "
f"Supported models: {', '.join(_NIM_MODEL_MAP.keys())}"
)
function_id, language_code = model_config
# Riva server configuration
server = "grpc.nvcf.nvidia.com:443"
# Auth with SSL and metadata
auth = riva.client.Auth(
use_ssl=True,
uri=server,
metadata_args=[
["function-id", function_id],
["authorization", f"Bearer {api_key}"],
],
)
asr_service = riva.client.ASRService(auth)
# Configure recognition - language_code from model config
config = riva.client.RecognitionConfig(
language_code=language_code,
max_alternatives=1,
verbatim_transcripts=True,
)
# Read audio file
with open(file_path, "rb") as f:
data = f.read()
# Perform offline recognition
response = asr_service.offline_recognize(data, config)
# Extract text from response - use getattr for safe attribute access
transcript = ""
results = getattr(response, "results", None)
if results and results[0].alternatives:
transcript = results[0].alternatives[0].transcript
logger.debug(f"NIM transcription: {len(transcript)} chars")
return transcript or "(no speech detected)"