| from typing import Dict |
| from transformers.pipelines.audio_utils import ffmpeg_read |
| import whisper |
| import torch |
|
|
# Target sample rate for decoded audio; Whisper models expect 16 kHz mono input.
SAMPLE_RATE = 16000
|
|
|
|
|
|
class EndpointHandler():
    """Hugging Face Inference Endpoints custom handler that transcribes
    audio with OpenAI Whisper.

    Loads the "large" Whisper checkpoint once at startup and transcribes
    raw audio bytes on each call.
    """

    def __init__(self, path=""):
        # NOTE(review): `path` is the model directory supplied by the
        # Inference Toolkit; it is unused here because the checkpoint is
        # downloaded/loaded via the `whisper` package instead.
        self.model = whisper.load_model("large")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """
        Args:
            data (:obj:`dict`):
                includes the deserialized audio file as bytes under the
                "inputs" key; if the key is absent, `data` itself is used
                as the raw bytes.
        Return:
            A :obj:`dict` with a single "text" key holding the transcription.
        """
        inputs = data.pop("inputs", data)
        # Decode arbitrary audio bytes into a float32 waveform resampled
        # to 16 kHz, the rate Whisper expects.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)

        # Fix: the original also built a torch tensor from the array but
        # never used it (dead code) — model.transcribe() accepts the
        # numpy waveform directly.
        result = self.model.transcribe(audio_nparray)

        return {"text": result["text"]}