| --- |
| language: |
| - en |
| - zh |
| - de |
| - es |
| - ru |
| - ko |
| - fr |
| - ja |
| - pt |
| - tr |
| - pl |
| - ca |
| - nl |
| - ar |
| - sv |
| - it |
| - id |
| - hi |
| - fi |
| - vi |
| - he |
| - uk |
| - el |
| - ms |
| - cs |
| - ro |
| - da |
| - hu |
| - ta |
| - 'no' |
| - th |
| - ur |
| - hr |
| - bg |
| - lt |
| - la |
| - mi |
| - ml |
| - cy |
| - sk |
| - te |
| - fa |
| - lv |
| - bn |
| - sr |
| - az |
| - sl |
| - kn |
| - et |
| - mk |
| - br |
| - eu |
| - is |
| - hy |
| - ne |
| - mn |
| - bs |
| - kk |
| - sq |
| - sw |
| - gl |
| - mr |
| - pa |
| - si |
| - km |
| - sn |
| - yo |
| - so |
| - af |
| - oc |
| - ka |
| - be |
| - tg |
| - sd |
| - gu |
| - am |
| - yi |
| - lo |
| - uz |
| - fo |
| - ht |
| - ps |
| - tk |
| - nn |
| - mt |
| - sa |
| - lb |
| - my |
| - bo |
| - tl |
| - mg |
| - as |
| - tt |
| - haw |
| - ln |
| - ha |
| - ba |
| - jw |
| - su |
| tags: |
| - audio |
| - automatic-speech-recognition |
| - hf-asr-leaderboard |
| pipeline_tag: automatic-speech-recognition |
| license: apache-2.0 |
| license_link: https://choosealicense.com/licenses/apache-2.0/ |
| base_model: |
| - openai/whisper-large-v3-turbo |
| --- |
| |
| # whisper-large-v3-turbo-int4-ov-npu |
| * Model creator: [OpenAI](https://huggingface.co/openai) |
| * Original model: [whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) |
|
|
| ## Description |
| This is the [whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) model converted to the [OpenVINO™ IR](https://docs.openvino.ai/2025/documentation/openvino-ir-format.html) (Intermediate Representation) format with weights compressed to INT4. |
|
|
| ## Compatibility |
|
|
| The provided OpenVINO™ IR model is compatible with: |
|
|
| * OpenVINO version 2025.2.0 and higher |
| * Optimum Intel 1.23.0 and higher |
|
|
| The model can be exported from the original checkpoint with Optimum Intel: |
|
| ```bash |
| optimum-cli export openvino --trust-remote-code --model openai/whisper-large-v3-turbo --weight-format int4 --disable-stateful whisper-large-v3-turbo-int4-ov |
| ``` |
|
|
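| The following script downloads the model and benchmarks transcription on an NPU with OpenVINO GenAI: |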
|
| ```python |
| #!/usr/bin/env python3 |
| import time |
| import requests |
| import openvino_genai |
| import librosa |
| from pathlib import Path |
| from huggingface_hub import snapshot_download |
| |
| |
def download_model(model_id="FluidInference/whisper-large-v3-turbo-int4-ov-npu"):
    """Download a model snapshot from the Hugging Face Hub, reusing a local copy.

    Args:
        model_id: Hub repository id in ``owner/name`` form. The snapshot is
            stored under ``models/<name>`` relative to the working directory.

    Returns:
        The local model directory as a string.
    """
    local_dir = Path("models") / model_id.split("/")[-1]

    # Any non-empty directory counts as "already downloaded"; its contents
    # are not validated, so delete the directory to force a fresh download.
    if local_dir.is_dir() and any(local_dir.iterdir()):
        return str(local_dir)

    print("Downloading model...")
    # `local_dir_use_symlinks` is deliberately not passed: huggingface_hub
    # deprecated it (files are always written as real copies into
    # `local_dir`), and newer releases may reject it.
    snapshot_download(repo_id=model_id, local_dir=str(local_dir))
    return str(local_dir)
| |
| |
def download_hf_audio_samples():
    """Fetch the sample speech recordings used by the benchmark.

    Files are stored in ``sample_audios/`` under the working directory. A file
    that already exists there is reused without being downloaded again. A
    failed download is reported on stdout and that sample is skipped.

    Returns:
        Paths (as strings) of the samples that are available locally, in the
        order they are listed below.
    """
    samples_dir = Path("sample_audios")
    samples_dir.mkdir(exist_ok=True)

    whisper_samples = [
        ("https://cdn-media.huggingface.co/speech_samples/sample1.flac", "sample1.flac"),
        ("https://cdn-media.huggingface.co/speech_samples/sample2.flac", "sample2.flac"),
    ]

    downloaded = []
    for url, filename in whisper_samples:
        filepath = samples_dir / filename
        if not filepath.exists():
            try:
                # Without a timeout, requests waits indefinitely on a stalled
                # connection and the whole benchmark would hang.
                response = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=30,
                )
                response.raise_for_status()
                filepath.write_bytes(response.content)
            except (requests.RequestException, OSError) as e:
                # Best effort: a missing sample must not abort the run.
                print(f"Error downloading {filename}: {e}")
                continue
        downloaded.append(str(filepath))

    return downloaded
| |
| |
def read_audio(filepath):
    """Load an audio file at a 16 kHz sample rate and return its samples.

    Returns:
        The samples as a plain Python list, or None when loading fails (the
        error is printed to stdout).
    """
    try:
        samples = librosa.load(filepath, sr=16000)[0]
        as_list = samples.tolist()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        as_list = None
    return as_list
| |
| |
def test_whisper_on_file(pipe, filepath):
    """Transcribe one audio file and measure how long inference takes.

    Args:
        pipe: Pipeline object exposing ``get_generation_config()`` and
            ``generate(raw_speech, config)``; ``main()`` passes an
            ``openvino_genai.WhisperPipeline``.
        filepath: Path of the audio file to transcribe.

    Returns:
        A dict with the keys ``file``, ``duration`` (seconds of audio),
        ``inference_time`` (seconds), ``rtf`` (inference time divided by audio
        duration) and ``transcription``; or None when the file cannot be read
        or contains no samples.
    """
    raw_speech = read_audio(filepath)
    if raw_speech is None:
        return None
    if not raw_speech:
        # A zero-length recording would make the RTF computation below divide
        # by zero, so skip it like an unreadable file.
        print(f"Skipping {filepath}: no audio samples")
        return None

    config = pipe.get_generation_config()
    config.language = "<|en|>"
    config.task = "transcribe"
    config.return_timestamps = True
    config.max_new_tokens = 448

    # 16000 must match the sample rate read_audio() loads the file at.
    duration = len(raw_speech) / 16000

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    result = pipe.generate(raw_speech, config)
    inference_time = time.perf_counter() - start_time

    return {
        "file": filepath,
        "duration": duration,
        "inference_time": inference_time,
        "rtf": inference_time / duration,
        "transcription": str(result),
    }
| |
| |
def main():
    """Benchmark Whisper transcription on the NPU and print a summary.

    Downloads the model, builds the pipeline on the "NPU" device, transcribes
    every ``*.wav`` file found in the working directory and in
    ``samples/c/whisper_speech_recognition`` plus the downloaded sample
    recordings, then prints per-file and aggregate real-time factors.
    """
    # Download model
    model_path = download_model()

    # Initialize pipeline on NPU
    print("\nInitializing NPU...")
    start_time = time.perf_counter()
    pipe = openvino_genai.WhisperPipeline(model_path, "NPU")
    init_time = time.perf_counter() - start_time

    # Collect test files; glob order depends on the filesystem, so sort to
    # keep the run order (and the printed report) reproducible.
    test_files = sorted(Path(".").glob("*.wav"))

    samples_dir = Path("samples/c/whisper_speech_recognition")
    if samples_dir.exists():
        test_files.extend(sorted(samples_dir.glob("*.wav")))

    # Download HF samples
    test_files.extend(Path(f) for f in download_hf_audio_samples())

    # Test all files
    print(f"\nTesting {len(test_files)} files...")
    results = []
    for audio_file in test_files:
        result = test_whisper_on_file(pipe, str(audio_file))
        if result:
            results.append(result)
            print(f"[OK] {Path(result['file']).name}: RTF={result['rtf']:.2f}x")

    if not results:
        # Say so explicitly instead of exiting without any summary.
        print("\nNo files were transcribed successfully.")
        return

    # Print summary
    total_duration = sum(r["duration"] for r in results)
    total_inference = sum(r["inference_time"] for r in results)
    avg_rtf = total_inference / total_duration
    verdict = "[Faster than real-time]" if avg_rtf < 1 else "[Slower than real-time]"

    print(f"\n{'='*50}")
    print("NPU Performance Summary")
    print(f"{'='*50}")
    print(f"Model load time: {init_time:.1f}s")
    print(f"Files tested: {len(results)}")
    print(f"Total audio: {total_duration:.1f}s")
    print(f"Total inference: {total_inference:.1f}s")
    print(f"Average RTF: {avg_rtf:.2f}x {verdict}")

    print("\nResults:")
    for r in results:
        trans = r["transcription"].strip()
        # Keep each report line short: cap the transcription at 60 characters.
        if len(trans) > 60:
            trans = trans[:57] + "..."
        print(f"- {Path(r['file']).name}: \"{trans}\"")


if __name__ == "__main__":
    main()
| ``` |