| import torch |
| from typing import Dict |
| from transformers import pipeline |
| from datasets import load_dataset |
| from transformers.pipelines.audio_utils import ffmpeg_read |
|
|
# Sampling rate (Hz) that incoming audio is resampled to by ffmpeg.
SAMPLE_RATE = 16000


class EndpointHandler:
    """Callable request handler that transcribes audio with a Whisper ASR pipeline.

    The pipeline is built once in ``__init__`` and reused for every call.
    """

    def __init__(self, path=""):
        """Create the automatic-speech-recognition pipeline.

        Args:
            path: Not used by this handler; the checkpoint id passed to
                ``pipeline`` below is hard-coded.
        """
        # Use the first CUDA device when one is visible, otherwise the CPU.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-large",
            chunk_length_s=30,
            device=device,
        )

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Transcribe one encoded audio payload.

        Args:
            data: Request mapping. The encoded audio bytes are taken from (and
                removed from) the ``"inputs"`` key.

        Returns:
            A dict with a single ``"text"`` key holding the transcription.

        Raises:
            TypeError: If no bytes-like audio payload is found under
                ``"inputs"``.
        """
        # When "inputs" is absent the fallback is the mapping itself, which
        # the type check below rejects with a clear message.
        inputs = data.pop("inputs", data)
        if not isinstance(inputs, (bytes, bytearray, memoryview)):
            raise TypeError(
                "expected raw audio bytes under the 'inputs' key, got "
                f"{type(inputs).__name__}"
            )

        # Decode and resample the payload to the module-level sampling rate.
        audio = ffmpeg_read(inputs, SAMPLE_RATE)

        prediction = self.pipe(audio, return_timestamps=True)
        return {"text": self._extract_text(prediction)}

    @staticmethod
    def _extract_text(prediction) -> str:
        """Return the ``"text"`` field of a pipeline result.

        Accepts either a single result mapping or a sequence of them, in
        which case the first entry is used.
        """
        if isinstance(prediction, (list, tuple)):
            prediction = prediction[0]
        return prediction["text"]
| |
| |
| |
| |
| |