# Hugging Face Space — ZeroGPU runtime ("Running on Zero").
import os

import gradio as gr
import librosa
import spaces  # HF ZeroGPU helper; importing it is required on Zero Spaces
import torch
from huggingface_hub import login
from transformers import (
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    WhisperTokenizerFast,
)
# Monkey-patch WhisperTokenizerFast so that an ``extra_special_tokens``
# list (as saved by some checkpoints) is accepted: newer transformers
# versions expect a mapping, so coerce list -> {token: token}.
_orig_init = WhisperTokenizerFast.__init__


def _patched_init(self, *args, **kwargs):
    """Wrap the original __init__, normalizing ``extra_special_tokens``."""
    extra = kwargs.get("extra_special_tokens")
    if isinstance(extra, list):
        kwargs["extra_special_tokens"] = {token: token for token in extra}
    return _orig_init(self, *args, **kwargs)


WhisperTokenizerFast.__init__ = _patched_init
# Authenticate against the Hugging Face Hub when a token is provided
# (needed if MODEL_REPO is private; a missing token is simply skipped).
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Model repository; overridable via the MODEL_REPO_ID environment variable.
MODEL_REPO = os.getenv("MODEL_REPO_ID", "Batuka0901/MN_ASR")
# Whisper feature extraction expects 16 kHz audio.
SAMPLING_RATE = 16000

# Load model/tokenizer/feature-extractor on CPU at startup; the move to
# GPU (if any) happens lazily inside transcribe().
print(f"Loading {MODEL_REPO} (CPU at startup) ...")
model = WhisperForConditionalGeneration.from_pretrained(MODEL_REPO, token=HF_TOKEN)
model.eval()
tokenizer = WhisperTokenizerFast.from_pretrained(MODEL_REPO, token=HF_TOKEN)
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_REPO, token=HF_TOKEN)
print("Model loaded.")
# True once the model has been moved to the GPU (done lazily on first request).
_moved_to_cuda = False

# UI status messages (Mongolian): waiting for audio / ready to submit /
# transcribing / finished.
WAITING = "Төлөв: **Аудио хүлээж байна...**"
READY = "Төлөв: **Илгээхэд бэлэн.**"
WORKING = "Төлөв: **Танилт хийж байна...**"
DONE = "Төлөв: **Дууссан.**"
@spaces.GPU  # on ZeroGPU Spaces the GPU only exists inside decorated calls;
# without this decorator torch.cuda.is_available() is always False here.
def transcribe(audio_path):
    """Transcribe an audio file with the Whisper model.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path to the uploaded/recorded audio, or None/"" when
        no audio has been provided yet.

    Returns
    -------
    tuple[str, str]
        ``(transcribed_text, status_markdown)``; on failure the text is
        empty and the status carries the error message.
    """
    global _moved_to_cuda
    if not audio_path:
        return "", WAITING
    try:
        # Lazily move the model to the GPU the first time one is visible.
        if not _moved_to_cuda and torch.cuda.is_available():
            model.to("cuda")
            _moved_to_cuda = True
        device = "cuda" if (_moved_to_cuda and torch.cuda.is_available()) else "cpu"
        # librosa resamples to the 16 kHz input Whisper expects.
        audio, _ = librosa.load(audio_path, sr=SAMPLING_RATE)
        inputs = feature_extractor(
            audio, sampling_rate=SAMPLING_RATE, return_tensors="pt"
        )
        input_features = inputs.input_features.to(device)
        with torch.no_grad():
            predicted_ids = model.generate(
                input_features, language="mn", task="transcribe"
            )
        text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return text.strip(), DONE
    except Exception as e:  # UI boundary: surface any failure in the status line
        return "", f"Төлөв: **Алдаа** — {type(e).__name__}: {e}"
def on_audio_change(audio_path):
    """Toggle the submit button: enabled only while audio is present."""
    if not audio_path:
        return gr.update(interactive=False), WAITING
    return gr.update(interactive=True), READY
def on_clear():
    """Reset the UI after Clear: disable submit and show the waiting status."""
    disabled = gr.update(interactive=False)
    return disabled, WAITING
# User-facing usage instructions (Mongolian), rendered as Markdown.
INSTRUCTIONS = """
### Заавар
1. **Audio оруулна уу** — файл upload хийх эсвэл микрофоноор шууд бичлэг хийнэ
2. **Generate** товчийг дарна — таны хэлсэн үгийг загвар таньж текст болгоно
"""

# CSS that hides the Gradio footer and the auto-generated API links.
CSS = """
footer { display: none !important; visibility: hidden !important; }
.gradio-container > .footer { display: none !important; }
button.api-link, .api-docs, a[href*="/api/"] { display: none !important; }
"""
# ---- UI layout and event wiring -------------------------------------------
with gr.Blocks(title="Speech to Text", css=CSS) as demo:
    with gr.Tab("Speech to Text"):
        gr.Markdown(INSTRUCTIONS)
        with gr.Row():
            # Left column: audio input plus the Clear/Generate buttons.
            with gr.Column(scale=1):
                audio_input = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Audio",
                )
                with gr.Row():
                    btn_clear = gr.Button("Clear", variant="secondary", size="sm")
                    btn_generate = gr.Button(
                        "Generate", variant="primary", size="sm", interactive=False
                    )
            # Right column: transcription output and a status line.
            with gr.Column(scale=1):
                output_box = gr.Textbox(label="Гаралт", lines=10)
                status_md = gr.Markdown(WAITING)

    # Enable/disable Generate whenever the audio component changes.
    audio_input.change(
        on_audio_change, inputs=audio_input, outputs=[btn_generate, status_md]
    )
    # Show the "working" status immediately, then run the transcription.
    btn_generate.click(lambda: WORKING, outputs=status_md).then(
        transcribe, inputs=audio_input, outputs=[output_box, status_md]
    )
    # Clear wipes audio, text and status, then disables Generate again.
    btn_clear.click(
        lambda: (None, "", WAITING),
        outputs=[audio_input, output_box, status_md],
    ).then(on_clear, outputs=[btn_generate, status_md])
if __name__ == "__main__":
    # Queue requests so transcriptions run one at a time; hide the API page.
    app = demo.queue()
    app.launch(show_api=False)