Voice Scribe mirror whisper from FluidInference/whisper-large-v3-turbo-int4-ov-npu@2e8fe5d17cf3

9e40266 verified about 1 month ago

5.96 kB

	---
	language:
	- en
	- zh
	- de
	- es
	- ru
	- ko
	- fr
	- ja
	- pt
	- tr
	- pl
	- ca
	- nl
	- ar
	- sv
	- it
	- id
	- hi
	- fi
	- vi
	- he
	- uk
	- el
	- ms
	- cs
	- ro
	- da
	- hu
	- ta
	- 'no'
	- th
	- ur
	- hr
	- bg
	- lt
	- la
	- mi
	- ml
	- cy
	- sk
	- te
	- fa
	- lv
	- bn
	- sr
	- az
	- sl
	- kn
	- et
	- mk
	- br
	- eu
	- is
	- hy
	- ne
	- mn
	- bs
	- kk
	- sq
	- sw
	- gl
	- mr
	- pa
	- si
	- km
	- sn
	- yo
	- so
	- af
	- oc
	- ka
	- be
	- tg
	- sd
	- gu
	- am
	- yi
	- lo
	- uz
	- fo
	- ht
	- ps
	- tk
	- nn
	- mt
	- sa
	- lb
	- my
	- bo
	- tl
	- mg
	- as
	- tt
	- haw
	- ln
	- ha
	- ba
	- jw
	- su
	tags:
	- audio
	- automatic-speech-recognition
	- hf-asr-leaderboard
	pipeline_tag: automatic-speech-recognition
	license: apache-2.0
	license_link: https://choosealicense.com/licenses/apache-2.0/
	base_model:
	- openai/whisper-large-v3-turbo
	---

	# whisper-large-v3-turbo-int4-ov-npu
	* Model creator: [OpenAI](https://huggingface.co/openai)
	* Original model: [whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo)

	## Description
	This is the [whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) model converted to the [OpenVINO™ IR](https://docs.openvino.ai/2025/documentation/openvino-ir-format.html) (Intermediate Representation) format, with weights compressed to INT4.

	## Compatibility

	The provided OpenVINO™ IR model is compatible with:

	* OpenVINO version 2025.2.0 and higher
	* Optimum Intel 1.23.0 and higher


	```bash
	# Export openai/whisper-large-v3-turbo to OpenVINO IR with INT4 weight compression.
	# NOTE(review): presumably the command used to produce the files in this repository - confirm.
	optimum-cli export openvino --trust-remote-code --model openai/whisper-large-v3-turbo --weight-format int4 --disable-stateful whisper-large-v3-turbo-int4-ov
	```


	```python
	#!/usr/bin/env python3
	import time
	import requests
	import openvino_genai
	import librosa
	from pathlib import Path
	from huggingface_hub import snapshot_download


	def download_model(model_id="FluidInference/whisper-large-v3-turbo-int4-ov-npu"):
	    """Fetch a model snapshot from the Hugging Face Hub, reusing a local copy.

	    The snapshot lives under ``models/<last path component of model_id>``.
	    When that directory already exists and contains at least one entry,
	    nothing is downloaded.

	    Args:
	        model_id: Hub repository id, e.g. ``owner/name``.

	    Returns:
	        The local model directory, as a string.
	    """
	    repo_name = model_id.split("/")[-1]
	    target = Path("models") / repo_name
	    target_str = str(target)

	    # A populated directory is treated as a completed earlier download.
	    already_present = target.exists() and any(target.iterdir())
	    if not already_present:
	        print("Downloading model...")
	        snapshot_download(
	            repo_id=model_id,
	            local_dir=target_str,
	            local_dir_use_symlinks=False,
	        )
	    return target_str


	def download_hf_audio_samples():
	    """Download the sample audio clips, reusing any that are already on disk.

	    Clips are stored in ``sample_audios/``. A clip that already exists
	    locally is returned without touching the network. A clip that fails to
	    download is reported on stdout and skipped, so the result may contain
	    fewer entries than there are samples.

	    Returns:
	        List of local file paths (strings) for the clips that are available.
	    """
	    samples_dir = Path("sample_audios")
	    samples_dir.mkdir(exist_ok=True)

	    whisper_samples = [
	        ("https://cdn-media.huggingface.co/speech_samples/sample1.flac", "sample1.flac"),
	        ("https://cdn-media.huggingface.co/speech_samples/sample2.flac", "sample2.flac"),
	    ]

	    downloaded = []
	    for url, filename in whisper_samples:
	        filepath = samples_dir / filename
	        if not filepath.exists():
	            try:
	                # requests applies no timeout by default; without one a
	                # stalled connection would block the script indefinitely.
	                response = requests.get(
	                    url,
	                    headers={"User-Agent": "Mozilla/5.0"},
	                    timeout=30,
	                )
	                response.raise_for_status()
	                filepath.write_bytes(response.content)
	            except (requests.RequestException, OSError) as e:
	                # Best effort: report the failure and move on to the next clip.
	                print(f"Error downloading {filename}: {e}")
	                continue
	        downloaded.append(str(filepath))

	    return downloaded


	def read_audio(filepath):
	    """Load an audio file at a 16 kHz sample rate.

	    Args:
	        filepath: Path of the audio file to load.

	    Returns:
	        The samples as a plain Python list, or ``None`` if loading fails
	        (the error is printed rather than raised).
	    """
	    try:
	        samples, _sample_rate = librosa.load(filepath, sr=16000)
	        as_list = samples.tolist()
	    except Exception as e:
	        print(f"Error reading {filepath}: {e}")
	        return None
	    return as_list


	def test_whisper_on_file(pipe, filepath):
	    """Transcribe one audio file and measure how long inference takes.

	    Args:
	        pipe: Pipeline object exposing ``get_generation_config()`` and
	            ``generate(samples, config)``.
	        filepath: Path of the audio file to transcribe.

	    Returns:
	        A dict with the keys ``file``, ``duration`` (seconds of audio),
	        ``inference_time`` (seconds), ``rtf`` (``inference_time / duration``)
	        and ``transcription``; or ``None`` when the file cannot be read or
	        contains no samples.
	    """
	    sample_rate = 16000  # read_audio() loads everything at 16 kHz

	    config = pipe.get_generation_config()
	    # Whisper language tokens have the form "<|en|>". The backslashes in the
	    # previous value were Markdown pipe escapes that leaked into the code and
	    # produced a string that is not a valid language token.
	    config.language = "<|en|>"
	    config.task = "transcribe"
	    config.return_timestamps = True
	    config.max_new_tokens = 448

	    raw_speech = read_audio(filepath)
	    # Skip unreadable files and empty audio; the latter would otherwise
	    # divide by zero when computing the real-time factor.
	    if raw_speech is None or len(raw_speech) == 0:
	        return None

	    duration = len(raw_speech) / sample_rate

	    # perf_counter() is monotonic, so the interval cannot be skewed by
	    # wall-clock adjustments.
	    start_time = time.perf_counter()
	    result = pipe.generate(raw_speech, config)
	    inference_time = time.perf_counter() - start_time

	    return {
	        "file": filepath,
	        "duration": duration,
	        "inference_time": inference_time,
	        "rtf": inference_time / duration,
	        "transcription": str(result),
	    }


	def main():
	    """Benchmark Whisper transcription on the NPU and print a summary.

	    Downloads the model, builds the pipeline on the ``"NPU"`` device, then
	    transcribes every ``*.wav`` file in the current directory and in
	    ``samples/c/whisper_speech_recognition`` (when that directory exists),
	    plus the downloaded sample clips. Files that cannot be transcribed are
	    skipped. Finally prints load time, totals, the average real-time factor
	    and a (truncated) transcription per file.
	    """
	    # Download model
	    model_path = download_model()

	    # Initialize pipeline on NPU; perf_counter() is monotonic, which makes it
	    # the right clock for measuring an interval.
	    print("\nInitializing NPU...")
	    load_start = time.perf_counter()
	    pipe = openvino_genai.WhisperPipeline(model_path, "NPU")
	    init_time = time.perf_counter() - load_start

	    # Collect test files
	    test_files = list(Path(".").glob("*.wav"))
	    extra_dir = Path("samples/c/whisper_speech_recognition")
	    if extra_dir.exists():
	        test_files.extend(extra_dir.glob("*.wav"))
	    test_files.extend(Path(f) for f in download_hf_audio_samples())

	    # Test all files
	    print(f"\nTesting {len(test_files)} files...")
	    results = []
	    for audio_file in test_files:
	        result = test_whisper_on_file(pipe, str(audio_file))
	        if result:
	            results.append(result)
	            print(f"[OK] {Path(result['file']).name}: RTF={result['rtf']:.2f}x")

	    if not results:
	        # Say so explicitly instead of exiting silently.
	        print("\nNo files could be transcribed.")
	        return

	    # Print summary
	    total_duration = sum(r["duration"] for r in results)
	    total_inference = sum(r["inference_time"] for r in results)
	    avg_rtf = total_inference / total_duration
	    verdict = "[Faster than real-time]" if avg_rtf < 1 else "[Slower than real-time]"

	    print(f"\n{'='*50}")
	    print("NPU Performance Summary")
	    print(f"{'='*50}")
	    print(f"Model load time: {init_time:.1f}s")
	    print(f"Files tested: {len(results)}")
	    print(f"Total audio: {total_duration:.1f}s")
	    print(f"Total inference: {total_inference:.1f}s")
	    print(f"Average RTF: {avg_rtf:.2f}x {verdict}")

	    print("\nResults:")
	    for r in results:
	        trans = r["transcription"].strip()
	        # Keep each line short: cap the transcription at 60 characters.
	        if len(trans) > 60:
	            trans = trans[:57] + "..."
	        print(f"- {Path(r['file']).name}: \"{trans}\"")


	# Run the benchmark only when this file is executed as a script.
	if __name__ == "__main__":
	    main()
	```