| import numpy as np |
| from src.utils import get_device |


| def get_transcriber(model="openai/whisper-base.en", use_gpu=True, gpu_id='auto'): |
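    """
    Build a Hugging Face automatic-speech-recognition pipeline.

    :param model: model id on the Hugging Face Hub
    :param use_gpu: allow GPU placement if one is available
    :param gpu_id: GPU index, or 'auto' to use GPU 0
    :return: transformers ASR pipeline
    """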
| if gpu_id == 'auto': |
| gpu_id = 0 |
| device = get_device() |
    if device == 'cpu' or not use_gpu:
        # force CPU placement; 'auto' could otherwise still place the model on a GPU
        device_map = {'': 'cpu'}
| else: |
| device_map = {"": gpu_id} if gpu_id >= 0 else {'': 'cuda'} |

    # lazy import: transformers is heavy, so load it only when building a pipeline
    from transformers import pipeline
| transcriber = pipeline("automatic-speech-recognition", model=model, device_map=device_map) |
| return transcriber |
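

# Example usage (a sketch, not part of this module's tested surface): build a
# CPU transcriber and run it on a local file. soundfile and "sample.wav" are
# illustrative assumptions.
#
#   asr = get_transcriber(use_gpu=False)
#   import soundfile as sf
#   y, sr = sf.read("sample.wav", dtype='float32')
#   print(asr({"sampling_rate": sr, "raw": y})["text"])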


| def transcribe(audio_state1, new_chunk, transcriber=None, max_chunks=None, sst_floor=100.0, reject_no_new_text=True, |
| debug=False): |
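    """
    Incrementally transcribe streaming audio.

    :param audio_state1: mutable state list, read here as
        [pre-existing text, combined text, accumulated chunks, 'on'/'off' flag]
        (layout inferred from usage in this function)
    :param new_chunk: (sampling_rate, samples) tuple for the latest audio chunk
    :param transcriber: ASR pipeline from get_transcriber(); required
    :param max_chunks: stop accumulating after this many chunks
    :param sst_floor: mean absolute amplitude below which a chunk is treated as silence
    :param reject_no_new_text: drop chunks that do not change the transcription
    :param debug: print diagnostic messages
    :return: (audio_state1, combined text)
    """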
| if audio_state1[0] is None: |
| audio_state1[0] = '' |
| if audio_state1[2] is None: |
| audio_state1[2] = [] |
    if max_chunks is not None and len(audio_state1[2]) > max_chunks:
        # chunk budget reached: stop accumulating and return the text unchanged
        return audio_state1, audio_state1[1]
| if audio_state1[3] == 'off': |
| if debug: |
| print("Already ended", flush=True) |
| return audio_state1, audio_state1[1] |

    # new_chunk is a (sampling_rate, samples) tuple; the sampling rate is
    # assumed constant across chunks
    sr, y = new_chunk
| if y.shape[0] == 0: |
| avg = 0.0 |
| else: |
        # collapse stereo (samples, channels) to mono before measuring loudness
        if len(y.shape) > 1:
            y = np.mean(y, axis=1)
        avg = np.average(np.abs(y))
| if not np.isfinite(avg): |
| avg = 0.0 |
| if avg > sst_floor: |
| if debug: |
| print("Got possible chunk: %s" % avg, flush=True) |
| chunks_new = audio_state1[2] + [y] |
| else: |
| chunks_new = audio_state1[2] |
| if debug: |
| print("Rejected quiet chunk: %s" % avg, flush=True) |
    if chunks_new:
        stream = np.concatenate(chunks_new)
        stream = stream.astype(np.float32)
        # peak-normalize; the epsilon guards against division by zero on silence
        max_stream = np.max(np.abs(stream)) + 1E-7
        stream /= max_stream
        text = transcriber({"sampling_rate": sr, "raw": stream})["text"]

        # transcribe the previous chunks alone so a chunk that adds no new text
        # can be detected and rejected below
        if audio_state1[2]:
            stream0 = np.concatenate(audio_state1[2])
            stream0 = stream0.astype(np.float32)
            max_stream0 = np.max(np.abs(stream0)) + 1E-7
            stream0 /= max_stream0
            text_y = transcriber({"sampling_rate": sr, "raw": stream0})["text"]
        else:
            text_y = None

        if debug:
            print("y.shape: %s stream.shape: %s text0=%s text=%s text_y=%s" % (
                str(y.shape), str(stream.shape), audio_state1[0], text, text_y), flush=True)
        if reject_no_new_text and (text == text_y):
            if debug:
                print("Rejected non-textual chunk: %s" % avg, flush=True)
            # keep the old chunk list; the new chunk changed nothing
        else:
            audio_state1[2] = chunks_new
| else: |
| text = '' |

    # avoid appending a duplicate when the transcription matches the existing text
    if audio_state1[0] == text:
        text = ''

    # combine pre-existing text with the new transcription for display
    if audio_state1[0] is not None:
        audio_state1[1] = audio_state1[0] + text
    return audio_state1, audio_state1[1]
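

# Smoke-test sketch (an assumption, not an official entry point for this module):
# drive transcribe() with synthetic int16 noise chunks the way a streaming UI
# would, using the state layout [pre-existing text, combined text, chunks,
# 'on'/'off'] inferred from the function above.
if __name__ == '__main__':
    asr = get_transcriber(use_gpu=False)
    sr = 16000
    audio_state = ['', '', [], 'on']
    for _ in range(3):
        # one second of noise, loud enough to clear the default sst_floor
        chunk = np.random.randint(-1000, 1000, size=sr, dtype=np.int16)
        audio_state, text = transcribe(audio_state, (sr, chunk), transcriber=asr, debug=True)
    print("final transcription: %r" % text)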