Batuka0901 committed on
Commit
4306377
·
verified ·
1 Parent(s): 1cd819a

Upload folder using huggingface_hub

Browse files
Files changed (3)
  1. README.md +0 -1
  2. app.py +38 -19
  3. requirements.txt +1 -0
README.md CHANGED
@@ -31,4 +31,3 @@ Space → Settings → Variables and secrets:
31
  |---|---|---|
32
  | HF_TOKEN | Secret | Read эрхтэй жетон |
33
  | MODEL_REPO_ID | Variable | Batuka0901/MN_ASR |
34
- | BASE_MODEL | Variable | openai/whisper-small |
 
31
  |---|---|---|
32
  | HF_TOKEN | Secret | Read эрхтэй жетон |
33
  | MODEL_REPO_ID | Variable | Batuka0901/MN_ASR |
 
app.py CHANGED
@@ -1,13 +1,14 @@
1
  import os
2
 
3
  import gradio as gr
 
 
4
  import torch
5
  from huggingface_hub import login
6
  from transformers import (
7
  WhisperFeatureExtractor,
8
  WhisperForConditionalGeneration,
9
  WhisperTokenizerFast,
10
- pipeline,
11
  )
12
 
13
 
@@ -29,21 +30,15 @@ if HF_TOKEN:
29
  login(token=HF_TOKEN)
30
 
31
  MODEL_REPO = os.getenv("MODEL_REPO_ID", "Batuka0901/MN_ASR")
32
- BASE_MODEL = os.getenv("BASE_MODEL", "openai/whisper-small")
33
- DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
34
 
35
- print(f"Loading {MODEL_REPO} on {DEVICE} ...")
36
  model = WhisperForConditionalGeneration.from_pretrained(MODEL_REPO, token=HF_TOKEN)
 
37
  tokenizer = WhisperTokenizerFast.from_pretrained(MODEL_REPO, token=HF_TOKEN)
38
- feature_extractor = WhisperFeatureExtractor.from_pretrained(BASE_MODEL)
39
- asr = pipeline(
40
- task="automatic-speech-recognition",
41
- model=model,
42
- tokenizer=tokenizer,
43
- feature_extractor=feature_extractor,
44
- device=DEVICE,
45
- )
46
  print("Model loaded.")
 
47
 
48
 
49
  WAITING = "Төлөв: **Аудио хүлээж байна...**"
@@ -52,13 +47,29 @@ WORKING = "Төлөв: **Танилт хийж байна...**"
52
  DONE = "Төлөв: **Дууссан.**"
53
 
54
 
 
55
  def transcribe(audio_path):
 
56
  if not audio_path:
57
  return "", WAITING
58
  try:
59
- result = asr(audio_path)
60
- text = (result.get("text") or "").strip()
61
- return text, DONE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  except Exception as e:
63
  return "", f"Төлөв: **Алдаа** — {type(e).__name__}: {e}"
64
 
@@ -73,6 +84,17 @@ def on_clear():
73
  return gr.update(interactive=False), WAITING
74
 
75
 
 
 
 
 
 
 
 
 
 
 
 
76
  CSS = """
77
  footer { display: none !important; visibility: hidden !important; }
78
  .gradio-container > .footer { display: none !important; }
@@ -81,10 +103,7 @@ button.api-link, .api-docs, a[href*="/api/"] { display: none !important; }
81
 
82
  with gr.Blocks(title="Speech to Text", css=CSS) as demo:
83
  with gr.Tab("Speech to Text"):
84
- gr.Markdown(
85
- "Дуу бичих эсвэл audio файл оруулсанаар хэлсэн текстийг "
86
- "гаргаж өгнө. Доорх талбараас audio оруулаад **Илгээх** дарна уу."
87
- )
88
  with gr.Row():
89
  with gr.Column(scale=1):
90
  audio_in = gr.Audio(
 
1
  import os
2
 
3
  import gradio as gr
4
+ import librosa
5
+ import spaces
6
  import torch
7
  from huggingface_hub import login
8
  from transformers import (
9
  WhisperFeatureExtractor,
10
  WhisperForConditionalGeneration,
11
  WhisperTokenizerFast,
 
12
  )
13
 
14
 
 
30
  login(token=HF_TOKEN)
31
 
32
  MODEL_REPO = os.getenv("MODEL_REPO_ID", "Batuka0901/MN_ASR")
33
+ SAMPLING_RATE = 16000
 
34
 
35
+ print(f"Loading {MODEL_REPO} (CPU at startup) ...")
36
  model = WhisperForConditionalGeneration.from_pretrained(MODEL_REPO, token=HF_TOKEN)
37
+ model.eval()
38
  tokenizer = WhisperTokenizerFast.from_pretrained(MODEL_REPO, token=HF_TOKEN)
39
+ feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_REPO, token=HF_TOKEN)
 
 
 
 
 
 
 
40
  print("Model loaded.")
41
+ _moved_to_cuda = False
42
 
43
 
44
  WAITING = "Төлөв: **Аудио хүлээж байна...**"
 
47
  DONE = "Төлөв: **Дууссан.**"
48
 
49
 
50
+ @spaces.GPU(duration=60)
51
  def transcribe(audio_path):
52
+ global _moved_to_cuda
53
  if not audio_path:
54
  return "", WAITING
55
  try:
56
+ if not _moved_to_cuda and torch.cuda.is_available():
57
+ model.to("cuda")
58
+ _moved_to_cuda = True
59
+ device = "cuda" if (_moved_to_cuda and torch.cuda.is_available()) else "cpu"
60
+
61
+ audio, _ = librosa.load(audio_path, sr=SAMPLING_RATE)
62
+ inputs = feature_extractor(
63
+ audio, sampling_rate=SAMPLING_RATE, return_tensors="pt"
64
+ )
65
+ input_features = inputs.input_features.to(device)
66
+
67
+ with torch.no_grad():
68
+ predicted_ids = model.generate(
69
+ input_features, language="mn", task="transcribe"
70
+ )
71
+ text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
72
+ return text.strip(), DONE
73
  except Exception as e:
74
  return "", f"Төлөв: **Алдаа** — {type(e).__name__}: {e}"
75
 
 
84
  return gr.update(interactive=False), WAITING
85
 
86
 
87
+ INSTRUCTIONS = """
88
+ ### Заавар
89
+
90
+ 1. **Audio оруулна уу** — файл upload хийх эсвэл микрофоноор шууд бичлэг хийнэ
91
+ 2. **Илгээх** товчийг дарна — таны хэлсэн үгийг загвар таниж текст болгоно
92
+ 3. **Гаралт** хэсгээс таниулсан текстийг авч хуулна
93
+ 4. Шинээр оролдох бол **Clear** дараад дахин эхлүүлнэ
94
+
95
+ > Сайн тод бичлэг + чимээгүй орчин таних чанарыг сайжруулна.
96
+ """
97
+
98
  CSS = """
99
  footer { display: none !important; visibility: hidden !important; }
100
  .gradio-container > .footer { display: none !important; }
 
103
 
104
  with gr.Blocks(title="Speech to Text", css=CSS) as demo:
105
  with gr.Tab("Speech to Text"):
106
+ gr.Markdown(INSTRUCTIONS)
 
 
 
107
  with gr.Row():
108
  with gr.Column(scale=1):
109
  audio_in = gr.Audio(
requirements.txt CHANGED
@@ -6,3 +6,4 @@ gradio==5.49.1
6
  librosa
7
  numpy<2.0
8
  soundfile
 
 
6
  librosa
7
  numpy<2.0
8
  soundfile
9
+ spaces