File size: 5,957 Bytes

9e40266

---
language:
- en
- zh
- de
- es
- ru
- ko
- fr
- ja
- pt
- tr
- pl
- ca
- nl
- ar
- sv
- it
- id
- hi
- fi
- vi
- he
- uk
- el
- ms
- cs
- ro
- da
- hu
- ta
- 'no'
- th
- ur
- hr
- bg
- lt
- la
- mi
- ml
- cy
- sk
- te
- fa
- lv
- bn
- sr
- az
- sl
- kn
- et
- mk
- br
- eu
- is
- hy
- ne
- mn
- bs
- kk
- sq
- sw
- gl
- mr
- pa
- si
- km
- sn
- yo
- so
- af
- oc
- ka
- be
- tg
- sd
- gu
- am
- yi
- lo
- uz
- fo
- ht
- ps
- tk
- nn
- mt
- sa
- lb
- my
- bo
- tl
- mg
- as
- tt
- haw
- ln
- ha
- ba
- jw
- su
tags:
- audio
- automatic-speech-recognition
- hf-asr-leaderboard
pipeline_tag: automatic-speech-recognition
license: apache-2.0
license_link: https://choosealicense.com/licenses/apache-2.0/
base_model:
- openai/whisper-large-v3-turbo
---

# whisper-large-v3-turbo-int4-ov-npu
* Model creator: [OpenAI](https://huggingface.co/openai)
* Original model: [whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo)

## Description
This is the [whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) model converted to the [OpenVINO™ IR](https://docs.openvino.ai/2025/documentation/openvino-ir-format.html) (Intermediate Representation) format with weights compressed to INT4.

## Compatibility

The provided OpenVINO™ IR model is compatible with:

* OpenVINO version 2025.2.0 and higher
* Optimum Intel 1.23.0 and higher


```bash
optimum-cli export openvino --trust-remote-code --model openai/whisper-large-v3-turbo --weight-format int4 --disable-stateful whisper-large-v3-turbo-int4-ov
```


```python
#!/usr/bin/env python3
import time
import requests
import openvino_genai
import librosa
from pathlib import Path
from huggingface_hub import snapshot_download


def download_model(model_id="FluidInference/whisper-large-v3-turbo-int4-ov-npu"):
    """Return a local directory containing the model, fetching it if needed.

    The target directory is ``models/<last component of model_id>``, relative
    to the current working directory. If that directory already exists and
    holds at least one entry it is reused as-is; otherwise the repository is
    downloaded into it with ``snapshot_download``.

    Args:
        model_id: Hugging Face Hub repository id.

    Returns:
        The target directory as a string.
    """
    repo_name = model_id.split("/")[-1]
    target = Path("models") / repo_name
    target_str = str(target)

    # A non-empty directory is taken as a sign of an earlier download.
    already_present = target.exists() and any(target.iterdir())
    if not already_present:
        print("Downloading model...")
        snapshot_download(
            repo_id=model_id,
            local_dir=target_str,
            local_dir_use_symlinks=False,
        )

    return target_str


def download_hf_audio_samples(timeout=30):
    """Make the sample FLAC clips available locally, reusing cached copies.

    Files are stored in ``sample_audios/`` relative to the current working
    directory. A sample that already exists on disk is not downloaded again.
    Failures are reported on stdout and the affected sample is skipped, so
    the function never raises for network or filesystem errors.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection would block the script
            indefinitely.

    Returns:
        A list of file paths (as strings) for the samples that are available
        locally, in the order they are listed below.
    """
    samples_dir = Path("sample_audios")
    samples_dir.mkdir(exist_ok=True)

    whisper_samples = [
        ("https://cdn-media.huggingface.co/speech_samples/sample1.flac", "sample1.flac"),
        ("https://cdn-media.huggingface.co/speech_samples/sample2.flac", "sample2.flac"),
    ]

    downloaded = []
    for url, filename in whisper_samples:
        filepath = samples_dir / filename
        if not filepath.exists():
            # Write to a sibling ".part" file and rename it afterwards, so an
            # interrupted write can never leave a truncated file that a later
            # run would mistake for a cached sample.
            partial = filepath.with_name(filename + ".part")
            try:
                response = requests.get(
                    url,
                    headers={"User-Agent": "Mozilla/5.0"},
                    timeout=timeout,
                )
                response.raise_for_status()
                partial.write_bytes(response.content)
                partial.replace(filepath)
            except (requests.RequestException, OSError) as e:
                print(f"Error downloading {filename}: {e}")
                continue

        downloaded.append(str(filepath))

    return downloaded


def read_audio(filepath):
    """Load an audio file at a 16 kHz sample rate and return it as a list.

    Any exception raised while loading or converting is reported on stdout
    and turned into a ``None`` return value, so callers can skip the file.
    """
    try:
        samples, _sample_rate = librosa.load(filepath, sr=16000)
        as_list = samples.tolist()
    except Exception as err:
        print(f"Error reading {filepath}: {err}")
        return None
    return as_list


def test_whisper_on_file(pipe, filepath):
    """Transcribe one audio file and time the inference call.

    Args:
        pipe: Pipeline object exposing ``get_generation_config()`` and
            ``generate(raw_speech, config)``.
        filepath: Path of the audio file to transcribe.

    Returns:
        A dict with the keys ``file``, ``duration`` (seconds of audio),
        ``inference_time`` (seconds spent in ``generate``), ``rtf``
        (``inference_time / duration``) and ``transcription``; or ``None``
        when the file could not be read or contains no samples.
    """
    config = pipe.get_generation_config()
    config.language = "<|en|>"
    config.task = "transcribe"
    config.return_timestamps = True
    config.max_new_tokens = 448

    raw_speech = read_audio(filepath)
    if raw_speech is None:
        return None
    if not raw_speech:
        # An empty clip has zero duration; computing the real-time factor
        # would divide by zero, so skip the file like an unreadable one.
        print(f"Skipping {filepath}: no audio samples")
        return None

    # read_audio loads at 16 kHz, so sample count / 16000 is the length in seconds.
    duration = len(raw_speech) / 16000

    # perf_counter is monotonic, unlike time.time(), which can jump if the
    # system clock is adjusted during the measurement.
    start_time = time.perf_counter()
    result = pipe.generate(raw_speech, config)
    inference_time = time.perf_counter() - start_time

    return {
        "file": filepath,
        "duration": duration,
        "inference_time": inference_time,
        "rtf": inference_time / duration,
        "transcription": str(result),
    }


def main():
    """Benchmark the Whisper pipeline on the NPU and print a summary.

    Steps, in order: obtain the model directory, build the pipeline on the
    "NPU" device (timing the construction), gather ``*.wav`` files from the
    current directory and from ``samples/c/whisper_speech_recognition`` (if
    present) plus the downloaded sample clips, transcribe each file, and
    print per-file and aggregate real-time factors. Nothing is printed after
    the per-file lines when no file produced a result.
    """
    model_dir = download_model()

    print("\nInitializing NPU...")
    init_started = time.time()
    pipe = openvino_genai.WhisperPipeline(model_dir, "NPU")
    init_time = time.time() - init_started

    # Gather candidate files: local WAVs first, then the optional samples
    # directory, then the clips fetched from Hugging Face.
    candidates = list(Path(".").glob("*.wav"))
    bundled_dir = Path("samples/c/whisper_speech_recognition")
    if bundled_dir.exists():
        candidates += bundled_dir.glob("*.wav")
    candidates += [Path(sample) for sample in download_hf_audio_samples()]

    print(f"\nTesting {len(candidates)} files...")
    results = []
    for candidate in candidates:
        outcome = test_whisper_on_file(pipe, str(candidate))
        if not outcome:
            continue
        results.append(outcome)
        print(f"[OK] {Path(outcome['file']).name}: RTF={outcome['rtf']:.2f}x")

    if not results:
        return

    total_duration = sum(entry["duration"] for entry in results)
    total_inference = sum(entry["inference_time"] for entry in results)
    avg_rtf = total_inference / total_duration
    if avg_rtf < 1:
        verdict = "[Faster than real-time]"
    else:
        verdict = "[Slower than real-time]"

    banner = "=" * 50
    print(f"\n{banner}")
    print("NPU Performance Summary")
    print(banner)
    print(f"Model load time: {init_time:.1f}s")
    print(f"Files tested: {len(results)}")
    print(f"Total audio: {total_duration:.1f}s")
    print(f"Total inference: {total_inference:.1f}s")
    print(f"Average RTF: {avg_rtf:.2f}x {verdict}")

    print("\nResults:")
    for entry in results:
        text = entry["transcription"].strip()
        # Keep each line short: cap long transcriptions at 60 characters.
        if len(text) > 60:
            text = text[:57] + "..."
        name = Path(entry["file"]).name
        print(f'- {name}: "{text}"')


# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
```