Duplicated from openai/whisper-large-v2

S-Fry
/

large

Automatic Speech Recognition

hf-asr-leaderboard

Model card Files Files and versions

large / handler.py

S-Fry's picture

Update handler.py

0619e36 almost 3 years ago

history blame contribute delete

1.25 kB

	import torch
	from typing import Dict
	from transformers import pipeline
	from datasets import load_dataset
	from transformers.pipelines.audio_utils import ffmpeg_read

	# Sampling rate (Hz) that incoming audio is decoded to before transcription.
	SAMPLE_RATE = 16000


	class EndpointHandler:
	    """Callable request handler that transcribes audio bytes to text.

	    The speech-recognition pipeline is created once in ``__init__`` and
	    reused for every request passed to ``__call__``.
	    """

	    def __init__(self, path=""):
	        """Create the automatic-speech-recognition pipeline.

	        Args:
	            path: Accepted for interface compatibility but not used here; the
	                checkpoint is always loaded by its hub id
	                ``openai/whisper-large``.
	        """
	        # Use the first GPU when one is visible, otherwise fall back to CPU.
	        device = "cuda:0" if torch.cuda.is_available() else "cpu"
	        self.pipe = pipeline(
	            "automatic-speech-recognition",
	            model="openai/whisper-large",
	            chunk_length_s=30,
	            device=device,
	        )

	    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
	        """Transcribe one audio payload.

	        Args:
	            data: Request body. The encoded audio is read from (and removed
	                from) the ``"inputs"`` key.

	        Returns:
	            ``{"text": <transcript>}``.

	        Raises:
	            TypeError: If the payload is not a bytes-like object (this is
	                also what happens when the ``"inputs"`` key is missing).
	            ValueError: If the payload is empty.
	        """
	        # When "inputs" is absent the whole request dict is returned by pop();
	        # the type check below turns that into a clear error instead of
	        # handing a dict to the ffmpeg subprocess.
	        inputs = data.pop("inputs", data)
	        if not isinstance(inputs, (bytes, bytearray, memoryview)):
	            raise TypeError(
	                'expected raw audio bytes under the "inputs" key, '
	                f"got {type(inputs).__name__}"
	            )
	        if len(inputs) == 0:
	            raise ValueError('"inputs" is empty; no audio data to transcribe')

	        # Decode the payload into a waveform array sampled at SAMPLE_RATE.
	        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)

	        # For a single input the pipeline returns a mapping whose transcript
	        # lives under "text"; indexing the result with 0 would raise KeyError.
	        prediction = self.pipe(audio_nparray, return_timestamps=True)
	        return {"text": prediction["text"]}

	# we can also return timestamps for the predictions
	#prediction = pipe(sample, return_timestamps=True)["chunks"]
	#[{'text': ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.',
	# 'timestamp': (0.0, 5.44)}]