YAML Metadata Warning: empty or missing YAML metadata in repo card
Check out the documentation for more information.
Whisper Large V3 - Word Intelligibility (Synth Aug Pooled)
Fine-tuned Whisper Large V3 for word-level intelligibility prediction.
Model Description
This model predicts word-level intelligibility scores (0-5 scale) from audio. It uses a fine-tuned Whisper decoder with a regression head on decoder hidden states.
Technical changes from previous version
- Mean pooled ratings: Trained on mean-pooled rater scores instead of individual rater scores
- Synthetic data augmentation: Training data augmented with word omissions and word substitutions
Evaluation
Evaluated on 100 audio samples covering various scenarios (word omissions, word substitutions, etc.):
| Metric | Prod | This Model |
|---|---|---|
| PCC | 0.720 | 0.838 |
| SCC | 0.717 | 0.762 |
| MAE | 0.985 | 0.635 |
Training Details
- Base model: openai/whisper-large-v3
- Encoder: Frozen
- Trainable params: ~637M (decoder) + 344K (head)
Usage
import torch
import torch.nn as nn
from transformers import WhisperForConditionalGeneration, WhisperProcessor
# Hub repo hosting both the fine-tuned Whisper weights and the
# regression-head checkpoint ("head.pt").
REPO = "boldvoice/whisper-large-v3-word-intelligibility-synth-aug-pooled"
# Load model
model = WhisperForConditionalGeneration.from_pretrained(REPO)
processor = WhisperProcessor.from_pretrained(REPO)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()
# Load regression head
from huggingface_hub import hf_hub_download
head_path = hf_hub_download(REPO, "head.pt")
# NOTE(review): weights_only=False runs the full pickle loader on a
# downloaded file — acceptable only because the repo is trusted. Prefer
# weights_only=True if head.pt holds plain tensors/dicts; verify contents.
head_data = torch.load(head_path, map_location=device, weights_only=False)
class DecoderHiddenHead(nn.Module):
    """MLP regression head mapping a decoder hidden state to a score in (0, 1).

    Architecture: hidden_dim -> 256 -> 64 -> 1, with ReLU + dropout after
    each hidden linear layer and a final sigmoid squashing the output.
    """

    def __init__(self, hidden_dim=1280, dropout=0.1):
        super().__init__()
        # Build the stack as a list first, then wrap in Sequential.
        layers = [
            nn.Linear(hidden_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        self.head = nn.Sequential(*layers)

    def forward(self, x):
        # Drop the trailing singleton dim: (..., 1) -> (...,).
        return self.head(x).squeeze(-1)
# head.pt stores the trained hidden_dim plus the head's state dict.
head = DecoderHiddenHead(hidden_dim=head_data["hidden_dim"]).to(device)
head.load_state_dict(head_data["head_state_dict"])
head.eval()
# Tokenizer setup
tokenizer = processor.tokenizer
# Special-token ids used below to build the standard Whisper decoder
# prompt: <|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>.
sot = tokenizer.convert_tokens_to_ids("<|startoftranscript|>")
lang = tokenizer.convert_tokens_to_ids("<|en|>")
transcribe = tokenizer.convert_tokens_to_ids("<|transcribe|>")
notimestamps = tokenizer.convert_tokens_to_ids("<|notimestamps|>")
def tokenize_with_boundaries(words):
    """Tokenize each word and record its token span in the full sequence.

    Each word is encoded with a leading space (Whisper's word-boundary
    convention) and no special tokens.

    Returns:
        (all_tokens, spans) where spans[i] is the half-open [start, end)
        token range of words[i] within all_tokens.
    """
    all_tokens = []
    spans = []
    for word in words:
        piece = tokenizer.encode(" " + word, add_special_tokens=False)
        begin = len(all_tokens)
        all_tokens += piece
        spans.append((begin, len(all_tokens)))
    return all_tokens, spans
def predict_word_intelligibility(audio, words):
    """Predict an intelligibility score for each word of the transcript.

    Args:
        audio: numpy array of audio at 16kHz
        words: list of words (transcript)

    Returns:
        list of (word, score) tuples where score is 0-5; words whose
        token span falls past the end of the scored sequence are skipped.
    """
    features = processor(audio, sampling_rate=16000, return_tensors="pt")
    features = features.input_features.to(device)

    token_ids, spans = tokenize_with_boundaries(words)
    prefix = [sot, lang, transcribe, notimestamps]
    sequence = prefix + token_ids + [tokenizer.eos_token_id]
    # Teacher-forced pass: feed everything except the final token.
    decoder_ids = torch.tensor([sequence[:-1]], device=device)

    with torch.no_grad():
        out = model(features, decoder_input_ids=decoder_ids, output_hidden_states=True)
        # Last decoder layer, batch dim removed: (seq_len, hidden_dim).
        hidden = out.decoder_hidden_states[-1].squeeze(0)
        scores = head(hidden)

    offset = len(prefix)
    results = []
    for word, (lo, hi) in zip(words, spans):
        lo, hi = lo + offset, hi + offset
        if hi > len(scores):
            continue  # span extends beyond the scored positions
        # Head emits (0, 1); rescale to the 0-5 rating scale, averaging
        # over the word's sub-tokens.
        results.append((word, scores[lo:hi].mean().item() * 5.0))
    return results
# Example usage
import torchaudio
audio, sr = torchaudio.load("audio.wav")
# Whisper's feature extractor expects 16 kHz audio; resample if needed.
if sr != 16000:
    audio = torchaudio.functional.resample(audio, sr, 16000)
# NOTE(review): squeeze() assumes mono audio — a stereo file keeps its
# channel dim after squeeze; downmix to one channel first if needed.
audio = audio.squeeze().numpy()
words = ["Hello", "world", "how", "are", "you"]
scores = predict_word_intelligibility(audio, words)
for word, score in scores:
    print(f"{word}: {score:.2f}")
License
Proprietary - BoldVoice internal use only.
- Downloads last month
- 5,042
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support