Spaces:

Batuka0901
/

mn_stt

Running on Zero

App Files Files Community

Batuka0901 committed on 20 days ago

Commit

4306377

verified ·

1 Parent(s): 1cd819a

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

README.md +0 -1
app.py +38 -19
requirements.txt +1 -0

README.md CHANGED Viewed

@@ -31,4 +31,3 @@ Space → Settings → Variables and secrets:
 |---|---|---|
 | HF_TOKEN | Secret | Read эрхтэй жетон |
 | MODEL_REPO_ID | Variable | Batuka0901/MN_ASR |
-| BASE_MODEL | Variable | openai/whisper-small |

 |---|---|---|
 | HF_TOKEN | Secret | Read эрхтэй жетон |
 | MODEL_REPO_ID | Variable | Batuka0901/MN_ASR |

app.py CHANGED Viewed

@@ -1,13 +1,14 @@
 import os
 import gradio as gr
 import torch
 from huggingface_hub import login
 from transformers import (
     WhisperFeatureExtractor,
     WhisperForConditionalGeneration,
     WhisperTokenizerFast,
-    pipeline,
 )
@@ -29,21 +30,15 @@ if HF_TOKEN:
     login(token=HF_TOKEN)
 MODEL_REPO = os.getenv("MODEL_REPO_ID", "Batuka0901/MN_ASR")
-BASE_MODEL = os.getenv("BASE_MODEL", "openai/whisper-small")
-DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
-print(f"Loading {MODEL_REPO} on {DEVICE} ...")
 model = WhisperForConditionalGeneration.from_pretrained(MODEL_REPO, token=HF_TOKEN)
 tokenizer = WhisperTokenizerFast.from_pretrained(MODEL_REPO, token=HF_TOKEN)
-feature_extractor = WhisperFeatureExtractor.from_pretrained(BASE_MODEL)
-asr = pipeline(
-    task="automatic-speech-recognition",
-    model=model,
-    tokenizer=tokenizer,
-    feature_extractor=feature_extractor,
-    device=DEVICE,
-)
 print("Model loaded.")
 WAITING = "Төлөв: **Аудио хүлээж байна...**"
@@ -52,13 +47,29 @@ WORKING = "Төлөв: **Танилт хийж байна...**"
 DONE = "Төлөв: **Дууссан.**"
 def transcribe(audio_path):
     if not audio_path:
         return "", WAITING
     try:
-        result = asr(audio_path)
-        text = (result.get("text") or "").strip()
-        return text, DONE
     except Exception as e:
         return "", f"Төлөв: **Алдаа** — {type(e).__name__}: {e}"
@@ -73,6 +84,17 @@ def on_clear():
     return gr.update(interactive=False), WAITING
 CSS = """
 footer { display: none !important; visibility: hidden !important; }
 .gradio-container > .footer { display: none !important; }
@@ -81,10 +103,7 @@ button.api-link, .api-docs, a[href*="/api/"] { display: none !important; }
 with gr.Blocks(title="Speech to Text", css=CSS) as demo:
     with gr.Tab("Speech to Text"):
-        gr.Markdown(
-            "Дуу бичих эсвэл audio файл оруулсанаар хэлсэн текстийг "
-            "гаргаж өгнө. Доорх талбараас audio оруулаад **Илгээх** дарна уу."
-        )
         with gr.Row():
             with gr.Column(scale=1):
                 audio_in = gr.Audio(

 import os
 import gradio as gr
+import librosa
+import spaces
 import torch
 from huggingface_hub import login
 from transformers import (
     WhisperFeatureExtractor,
     WhisperForConditionalGeneration,
     WhisperTokenizerFast,
 )
     login(token=HF_TOKEN)
 MODEL_REPO = os.getenv("MODEL_REPO_ID", "Batuka0901/MN_ASR")
+SAMPLING_RATE = 16000
+print(f"Loading {MODEL_REPO} (CPU at startup) ...")
 model = WhisperForConditionalGeneration.from_pretrained(MODEL_REPO, token=HF_TOKEN)
+model.eval()
 tokenizer = WhisperTokenizerFast.from_pretrained(MODEL_REPO, token=HF_TOKEN)
+feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_REPO, token=HF_TOKEN)
 print("Model loaded.")
+_moved_to_cuda = False
 WAITING = "Төлөв: **Аудио хүлээж байна...**"
 DONE = "Төлөв: **Дууссан.**"
+@spaces.GPU(duration=60)
 def transcribe(audio_path):
+    global _moved_to_cuda
     if not audio_path:
         return "", WAITING
     try:
+        if not _moved_to_cuda and torch.cuda.is_available():
+            model.to("cuda")
+            _moved_to_cuda = True
+        device = "cuda" if (_moved_to_cuda and torch.cuda.is_available()) else "cpu"
+        audio, _ = librosa.load(audio_path, sr=SAMPLING_RATE)
+        inputs = feature_extractor(
+            audio, sampling_rate=SAMPLING_RATE, return_tensors="pt"
+        )
+        input_features = inputs.input_features.to(device)
+        with torch.no_grad():
+            predicted_ids = model.generate(
+                input_features, language="mn", task="transcribe"
+            )
+        text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        return text.strip(), DONE
     except Exception as e:
         return "", f"Төлөв: **Алдаа** — {type(e).__name__}: {e}"
     return gr.update(interactive=False), WAITING
+INSTRUCTIONS = """
+### Заавар
+1. **Audio оруулна уу** — файл upload хийх эсвэл микрофоноор шууд бичлэг хийнэ
+2. **Илгээх** товчийг дарна — таны хэлсэн үгийг загвар таниж текст болгоно
+3. **Гаралт** хэсгээс таниулсан текстийг авч хуулна
+4. Шинээр оролдох бол **Clear** дараад дахин эхлүүлнэ
+> Сайн тод бичлэг + чимээгүй орчин таних чанарыг сайжруулна.
+"""
 CSS = """
 footer { display: none !important; visibility: hidden !important; }
 .gradio-container > .footer { display: none !important; }
 with gr.Blocks(title="Speech to Text", css=CSS) as demo:
     with gr.Tab("Speech to Text"):
+        gr.Markdown(INSTRUCTIONS)
         with gr.Row():
             with gr.Column(scale=1):
                 audio_in = gr.Audio(

requirements.txt CHANGED Viewed

@@ -6,3 +6,4 @@ gradio==5.49.1
 librosa
 numpy<2.0
 soundfile

 librosa
 numpy<2.0
 soundfile
+spaces