import functools
import os

# On Hugging Face Spaces with ZeroGPU hardware the real `spaces` package is
# installed and its `GPU` decorator schedules the wrapped call on a GPU worker.
# Everywhere else we install a no-op stand-in so `@spaces.GPU` still applies.
if os.environ.get("SPACES_ZERO_GPU") is not None:
    import spaces
else:

    class spaces:
        """Local no-op replacement for the Hugging Face `spaces` module."""

        @staticmethod
        def GPU(func):
            """Identity decorator: invoke `func` directly, no GPU scheduling."""

            # functools.wraps preserves the wrapped function's name/docstring,
            # which the bare wrapper previously discarded.
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                return func(*args, **kwargs)

            return wrapper
|
| import gradio as gr
|
| import subprocess
|
|
|
|
|
|
|
| import torch
|
| import nemo.collections.asr as nemo_asr
|
|
|
| from pathlib import Path
|
|
|
# Select the compute device first, then load the pretrained Malayalam
# hybrid RNNT/CTC Conformer checkpoint and pin it there in inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = nemo_asr.models.ASRModel.from_pretrained(
    "ai4bharat/indicconformer_stt_ml_hybrid_rnnt_large"
)
model.freeze()  # eval mode + gradients disabled
model = model.to(device)
|
|
|
@spaces.GPU
def infer(srcfile: str):
    """Transcribe Malayalam speech with both the CTC and RNNT decoder heads.

    Args:
        srcfile: Path to the input audio (any format ffmpeg understands),
            or None/"" when the user submitted no audio.

    Returns:
        Tuple ``(ctc_text, rnnt_text)`` with the transcript from each decoder.

    Raises:
        subprocess.CalledProcessError: if the ffmpeg conversion fails.
    """
    import tempfile  # stdlib; local import keeps the file-top imports untouched

    # Gradio passes None when no audio was uploaded/recorded.
    if not srcfile:
        return "", ""

    # Unique temp file per call: the previous fixed filename collided across
    # concurrent requests and made ffmpeg stall/fail once the file existed.
    fd, tmpfile = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # Argument-list form (shell=False) avoids shell injection and breakage
        # on paths with spaces; -y overwrites the placeholder created above;
        # check=True surfaces conversion errors instead of silently ignoring them.
        subprocess.run(
            ["ffmpeg", "-y", "-i", srcfile, "-ac", "1", "-ar", "16000", tmpfile],
            check=True,
        )

        # The hybrid model exposes both heads; switch decoders between passes.
        model.cur_decoder = "ctc"
        ctc_text = model.transcribe(
            [tmpfile], batch_size=1, logprobs=False, language_id='ml'
        )[0]
        print(ctc_text)

        model.cur_decoder = "rnnt"
        rnnt_text = model.transcribe([tmpfile], batch_size=1, language_id='ml')[0]
        print(rnnt_text)
    finally:
        # Always clean up, even if ffmpeg or transcription raised.
        if Path(tmpfile).exists():
            Path(tmpfile).unlink()

    return ctc_text, rnnt_text
|
|
|
# Minimal UI: one audio input, a Run button, and one textbox per decoder head.
with gr.Blocks() as demo:
    audio_in = gr.Audio(
        label="Input",
        type="filepath",
        sources=["upload", "microphone"],
        format="wav",
    )
    run_btn = gr.Button("Run", variant="primary")
    with gr.Row():
        ctc_out = gr.Textbox(label="CTC", value="", show_copy_button=True)
        rnnt_out = gr.Textbox(label="RNNT", value="", show_copy_button=True)

    run_btn.click(fn=infer, inputs=[audio_in], outputs=[ctc_out, rnnt_out])

demo.launch()
|
|
|